爬虫3 css选择器和xpath选择器, selenium的使用, 爬取京东商品信息

1 css选择器和xpath选择器

# css选择器
#######
#1 css选择器
#######
# 重点

# Tag对象.select("css选择器")
#  #ID号
#  .类名
#   div>p：儿子 和div p：子子孙孙
#   找div下最后一个a标签 div a:last-child


# css选择器，xpath选择器会用了，它就是个通行证（所有的都可以不会，会粘贴就行）

# bs4：自己的选择器，css选择器
# lxml：css选择器，xpath选择器
# selenium：自己的选择器，css选择器，xpath选择器
# scrapy框架：自己的选择器，css选择器，xpath选择器
# #select('.article')



# BeautifulSoup exposes select() for CSS selectors; see the official docs:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37
#
# The three <a> tags carry id="link1".."link3" so that the '#link1' selectors
# used below actually match an element instead of printing an empty list.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'lxml')


# 1. CSS selectors (same syntax as front-end CSS)
# Class selector, searched inside the first <p> only.
print(soup.p.select('.sister'))
# Descendant selector: every <span> below an element with class "sister".
print(soup.select('.sister span'))

# Id selector, alone and combined with a descendant selector.
print(soup.select('#link1'))
print(soup.select('#link1 span'))

# Element below #list-2 that carries both classes "element" and "xxx".
print(soup.select('#list-2 .element.xxx'))

# select() can be chained on a result, although one combined selector does the same job.
print(soup.select('#list-2')[0].select('.element'))

# 2. Read the attributes of a tag (a dict)
print(soup.select('#list-2 h1')[0].attrs)

# 3. Read the text content of a tag
print(soup.select('#list-2 h1')[0].get_text())

# xpath选择
# / 从根节点选取  /a   从根节点开始，往下找a标签（子）
# //从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置  //a 从根节点开始找a标签（子子孙孙中所有a）
# .     选取当前节点。
# ..     选取当前节点的父节点。
# @     选取属性。


########
# 2 xpath选择器
########


# XPath 是一门在 XML 文档中查找信息的语言

# xpath选择
# / 从根节点选取  /a   从根节点开始，往下找a标签（子）
# //从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置  //a 从根节点开始找a标签（子子孙孙中所有a）
# 取值 /text()
# 取属性 /@属性名


# //*[@id="focus-1"]
# //ul[1]
# //*[@id="focus-1"]/div[1]/ul/li[3]/h2

# #focus-1 > div.focusimg-pic > ul > li:nth-child(3) > h2
# Sample HTML document used by the XPath examples that follow:
# a <div id='images'> holding six <a> links (each wrapping an <img>) and one <h5>.
doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' >Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <h5>test</h5>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''


from lxml import etree
# Build the element tree that the xpath() queries below run against.
html=etree.HTML(doc)  # parse from a string
# html=etree.parse('search.html',etree.HTMLParser())  # 文件

# 1 所有节点
# a=html.xpath('//*')
# 2 指定节点（结果为列表）
# a=html.xpath('//head')

# 3 子节点，子孙节点
# a=html.xpath('//div/a')

# a=html.xpath('//body/a') #无数据
# a=html.xpath('//body//a')

# 4 父节点
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[1]/..')
# 也可以这样
# a=html.xpath('//body//a[1]/parent::*')


# 5 属性匹配
# a=html.xpath('//body//a[@href="image1.html"]')

# 6 文本获取   标签后加：/text() ********重点
# a=html.xpath('//body//a[@href="image1.html"]/text()')
# a=html.xpath('//body//a/text()')

# 7 属性获取  标签后：/@href   ********重点
# a=html.xpath('//body//a/@href')
# # 注意从1 开始取（不是从0）
# a=html.xpath('//body//a[3]/@href')
# 8 属性多值匹配
#  a 标签有多个class类，直接匹配就不可以了，需要用contains
# a=html.xpath('//body//a[@class="li"]')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/@name')



# 9 多属性匹配 or 和 and （了解）
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')


# 10 按序选择
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# 取最后一个（了解）
# a=html.xpath('//a[last()]/@href')
# a=html.xpath('//a[last()]/text()')
# 位置小于3的(第一,第二个)
# a=html.xpath('//a[position()<3]/@href')
# a=html.xpath('//a[position()<3]/text()')
# 倒数第三个(last()往前数2个；倒数第二个是 last()-1)
# a=html.xpath('//a[last()-2]/@href')


# 11 节点轴选择
# ancestor：祖先节点
# 使用了* 获取所有祖先节点
# a=html.xpath('//a/ancestor::*')


# # 获取祖先节点中的div
# a=html.xpath('//a/ancestor::div')
# a=html.xpath('//a/ancestor::div/a[2]/text()')
# attribute：属性值
# a=html.xpath('//a[1]/attribute::*')
# a=html.xpath('//a[1]/@href')
# child：直接子节点
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/img/@src')
# descendant：所有子孙节点
# a=html.xpath('//a[6]/descendant::*')

# following:当前节点之后所有节点(递归)   会有重复情况，子节点的子节点，孙子节点
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:当前节点之后同级节点（同级）
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]')
# a=html.xpath('//a[1]/following-sibling::*[2]/@href')



# Every example assignment to `a` above is commented out, so run one query by
# default; otherwise the print below fails with NameError. Swap in any of the
# commented examples to try a different XPath expression.
a=html.xpath('//body//a/@href')
print(a)

2 selenium的简单使用

#1  selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题
 -可以操作浏览器(火狐，谷歌（建议你用谷歌），ie)，模拟人的行为（人可以干啥，代码控制就可以干啥）
#2  测试开发和开发有什么区别


########
# 3 selenium的使用
#######
# pip3 install selenium

# 1 基本使用
from selenium import webdriver
import time
# # 得到 一个谷歌浏览器对象
# # 代码不能直接操作浏览器，需要有一个浏览器驱动（配套的）
# # 下载谷歌浏览器驱动：http://npm.taobao.org/mirrors/chromedriver/
# # 谷歌浏览器驱动要跟谷歌版本对应(最后一组数字接近就行)
# # http://npm.taobao.org/mirrors/chromedriver/80.0.3987.106/   ：80.0.3987.149（正式版本）
# # 指定一下驱动的位置（相对路径/绝对路径）
# Start Chrome through the chromedriver binary located in the working directory.
bro=webdriver.Chrome(executable_path='./chromedriver')
# bro=webdriver.Chrome() # with no argument the driver is looked up on PATH (the original notes report this worked on Windows but not on macOS)

try:
    bro.get("https://www.baidu.com")

    # page_source plays the role of requests' response.text: it can be parsed
    # with bs4 or queried with selenium's own element locators.
    print(bro.page_source)
    time.sleep(5)
finally:
    # Always shut the browser down, even when loading the page fails.
    # quit() ends the whole driver session, whereas close() only closes the current window.
    bro.quit()

3 selenium的高级用法

# import time
# # 2 常用用法(在输入框中输入美女，搜索)
# bro=webdriver.Chrome(executable_path='./chromedriver')
#
# bro.get("https://www.baidu.com")
# # 在输入框中输入美女（自带的解析器，查找输入框控件）
# # 1、find_element_by_id  # id找
# # 2、find_element_by_link_text   # a标签上的文字找
# # 3、find_element_by_partial_link_text # a标签上的文字模糊
# # 4、find_element_by_tag_name        # 根据标签名字找
# # 5、find_element_by_class_name      # 根据类名字找
# # 6、find_element_by_name            # name='xx' 根据name属性找
# # 7、find_element_by_css_selector    # css选择器找
# # 8、find_element_by_xpath           #xpath选择器找
#
# # //*[@id="kw"]
# # input_search=bro.find_element_by_xpath('//*[@id="kw"]')
# input_search=bro.find_element_by_css_selector('#kw')
#
# # 写文字
# input_search.send_keys("美女")
# # 查找搜索按钮
# enter=bro.find_element_by_id('su')
#
# time.sleep(3)
# # 点击按钮
# enter.click()
#
# time.sleep(5)
# bro.close()


# 3 小案例
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver')
# bro.get("https://www.baidu.com")
#
# # 隐式等待(最多等待10s)
# # 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
# bro.implicitly_wait(10)
#
# submit_button=bro.find_element_by_link_text('登录')
# submit_button.click()
#
# user_button=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
# user_button.click()
#
# user_input=bro.find_element_by_id('TANGRAM__PSP_10__userName')
# user_input.send_keys("ssssss@qq.com")
#
# pwd_input=bro.find_element_by_id('TANGRAM__PSP_10__password')
# pwd_input.send_keys("123456")
#
#
# submit_input=bro.find_element_by_id('TANGRAM__PSP_10__submit')
# submit_input.click()
#
# time.sleep(5)
# bro.close()


# 4 获取cookie
# 登录之后，拿到cookie：就可以自己搭建cookie池（requests模块发请求，携带着cookie）
# # import time
# bro=webdriver.Chrome(executable_path='./chromedriver')
# bro.get("https://www.baidu.com")
# print(bro.get_cookies())
# bro.close()
#
# #搭建cookie池和代理池的作用是什么？封ip ，封账号（弄一堆小号，一堆cookie）

# 5 无界面浏览器（驱动谷歌，驱动其他浏览器）
# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
# chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
# chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
# chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
# chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
#
# bro=webdriver.Chrome(executable_path='./chromedriver',options=chrome_options)
# bro.get("https://www.baidu.com")
# print(bro.get_cookies())   # 获得cookies
# bro.close()


# 6 获取标签属性
# (重点：获取属性)
# print(tag.get_attribute('src'))
# print(tag.get_attribute('href'))
#(重点：获取文本)
# print(tag.text)
#
# #获取标签ID，位置，名称，大小（了解）
# print(tag.id)
# print(tag.location)
# print(tag.tag_name)
# print(tag.size)


# 7 显式等待和隐式等待
# 隐式等待(最多等待10s)
# 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
# bro.implicitly_wait(10)
# 显式等待（每个控件，都要写等待），不要使用

# 8 元素交互操作 点击click，清空clear，输入文字send_keys


#9 执行js
import time
# bro=webdriver.Chrome(executable_path='./chromedriver')
#
# bro.get("https://www.cnblogs.com")
# # 执行js代码
# # bro.execute_script('alert(1)') #此处如果不关闭弹出的消息窗，可能出现bro.close()无法关闭浏览器的情况，可用bro.quit()强制关闭
# # window.scrollTo(0,document.body.scrollHeight)     # window.scrollTo(0,100) 向下滑100
# bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')    # 滑动到底部
# time.sleep(5)
# bro.close()

# 10 模拟浏览器前进后退
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver')
#
# bro.get("https://www.cnblogs.com")
# time.sleep(1)
# bro.get("https://www.baidu.com")
# time.sleep(1)
# bro.get("https://www.jd.com")
#
# #退到上一个
# bro.back()
# time.sleep(1)
# # 前进一下
# bro.forward()
#
# time.sleep(5)
# bro.close()

# 10 选项卡管理(实际是用js实现的)
# import time
# from selenium import webdriver
#
# browser=webdriver.Chrome(executable_path='./chromedriver')
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')    # 打开新的选项卡
#
# print(browser.window_handles) #获取所有的选项卡
# browser.switch_to_window(browser.window_handles[1]) # 切换到选项卡1
# browser.get('https://www.taobao.com')
# time.sleep(2)
# browser.switch_to_window(browser.window_handles[0]) # 切换到选项卡0
# browser.get('https://www.sina.com.cn')
# browser.close()
## browser.quit() # 测试中无法关闭时，使用此句强制关闭

# 11 异常处理
# from selenium import webdriver
# from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
#
# try:
#     browser=webdriver.Chrome(executable_path='./chromedriver')
#     browser.get('http://www.baidu.com')
#     browser.find_element_by_id("xxx")
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()

4 爬取京东商品信息

########
# 爬取京东商品信息
#######
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
# Browser session shared by get_goods() and the search script below;
# it is driven through the chromedriver binary in the working directory.
bro=webdriver.Chrome(executable_path='./chromedriver')



def get_goods(bro, page_delay=1):
    """Print every product on the current result page, then page forward.

    Each product is printed with its name, price, image URL, product URL and
    comment count. After a page is processed the "下一页" (next page) link is
    clicked and the following page is handled the same way, until no such
    link is found, at which point the function returns.

    Args:
        bro: selenium WebDriver that is already showing a search-result page.
        page_delay: seconds to pause before clicking the next-page link.

    Returns:
        None. All results are written to stdout.
    """
    # Continuation lines stay at eight spaces so the printed layout is unchanged.
    template='''
        商品名字：%s
        商品价格：%s
        商品图片地址：%s
        商品地址：%s
        商品评论数：%s
        '''

    # Iterate instead of recursing once per page, so a long result set cannot
    # exhaust the interpreter's recursion limit.
    while True:
        # find_elements_* returns every match; find_element_* only the first.
        for li in bro.find_elements_by_class_name('gl-item'):
            url=li.find_element_by_css_selector('.p-img>a').get_attribute('href')

            img=li.find_element_by_css_selector('.p-img img')
            url_img=img.get_attribute("src")
            if not url_img:
                # Fall back to the lazy-load attribute when src is empty.
                # NOTE(review): the 'https:' prefix presumes a protocol-relative
                # value ("//host/...") - confirm against the live page.
                lazy_src=img.get_attribute("data-lazy-img")
                url_img='https:'+lazy_src if lazy_src else ''

            price=li.find_element_by_css_selector('.p-price i').text
            name=li.find_element_by_css_selector('.p-name em').text
            commit=li.find_element_by_css_selector('.p-commit a').text

            # Argument order must follow the labels: image URL first, product URL second.
            print(template%(name,price,url_img,url,commit))

        # The plural lookup returns an empty list when there is no next-page
        # link, so the last page ends the loop without raising.
        next_links=bro.find_elements_by_partial_link_text('下一页')
        if not next_links:
            return
        time.sleep(page_delay)
        next_links[0].click()

# Search JD for a keyword and print every product found by get_goods().
try:
    bro.get('https://www.jd.com')
    # Implicit wait: element lookups poll for up to 10 s before failing.
    bro.implicitly_wait(10)
    input_search=bro.find_element_by_id('key')
    input_search.send_keys("精品内衣")
    # Simulate pressing Enter in the search box to submit the query.
    input_search.send_keys(Keys.ENTER)
    get_goods(bro)

except Exception as e:
    # Top-level boundary of the script: report the failure instead of crashing.
    print(e)
finally:
    # quit() ends the whole driver session; close() would only close the
    # current window.
    bro.quit()

总结

# 1 css选择器： #id号   .类名   div p  div>p
# 2 xpath选择： /  //  @src   /text()    .   ..
# 3 selenium:自动化测试用，解决不能执行js代码，操作浏览器，模拟人的行为（可见即可爬）
# 4 浏览器驱动（版本，型号对应）
# 5 send_keys，click，clear，键盘操作，执行js（下拉屏幕），获取cookie，打开选项卡，前进后退
# 6 爬取京东商品信息（css选择器和xpath选择器）