Data Scraping with a multiprocessing.dummy Thread Pool

1. Example project (timing asynchronous vs. synchronous fetching)

  Create a test server with Flask:

from flask import Flask
from time import sleep
app = Flask(__name__)

@app.route('/bobo')
def index1():
    sleep(2)
    return 'hello bobo!'
@app.route('/jay')
def index2():
    sleep(2)
    return 'hello jay!'
@app.route('/tom')
def index3():
    sleep(2)
    return 'hello tom!'
app.run()
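One caveat: the timing comparison below assumes the development server handles requests concurrently. Flask 1.0+ runs its dev server with threading enabled by default; on older versions, pass the flag explicitly:

app.run(threaded=True)  # handle each request in its own thread so the 2-second handlers can overlap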

  Fetching without a thread pool (synchronous scraping):

import requests
import time

headers = {'User-Agent': 'Mozilla/5.0'}  # any ordinary UA is enough for the local test server

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]
for url in urls:
    page_text = requests.get(url,headers=headers).text
    print(page_text)
    
print(time.time()-start)

Test output (three sequential requests at ~2 seconds each, so about 6 seconds total):
hello bobo!
hello jay!
hello tom!
6.016878366470337

  Using the thread pool from multiprocessing.dummy:

import requests
import time
from multiprocessing.dummy import Pool  # thread-pool module: a thread-based clone of multiprocessing.Pool

headers = {'User-Agent': 'Mozilla/5.0'}

# the mapped function must take exactly one argument
def my_requests(url):
    return requests.get(url=url, headers=headers).text


start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

pool = Pool(3)
# map takes two arguments:
#   arg 1: a user-defined function that accepts exactly one parameter
#   arg 2: an iterable (a list here)
# map applies the function to each element of the iterable asynchronously
# and returns the results in the same order as the input
page_texts = pool.map(my_requests, urls)
print(page_texts)


print(time.time()-start)


# Output
['hello bobo!', 'hello jay!', 'hello tom!']
2.0126171112060547
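For reference, the same fan-out can be written with the standard library's concurrent.futures, which is the usual choice in modern code; a minimal equivalent sketch against the same three routes:

import time
import requests
from concurrent.futures import ThreadPoolExecutor

headers = {'User-Agent': 'Mozilla/5.0'}
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

start = time.time()
# like Pool.map, executor.map returns results in input order
with ThreadPoolExecutor(max_workers=3) as executor:
    page_texts = list(executor.map(lambda u: requests.get(u, headers=headers).text, urls))
print(page_texts)
print(time.time() - start)  # again close to 2s rather than 6s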

2. Example project (Pear Video)

  Requirement: scrape video data from Pear Video and measure how long the crawl takes. URL: https://www.pearvideo.com/

  2.1 Plain (synchronous) scraping:

import requests
import random
from lxml import etree
import re
from fake_useragent import UserAgent
# install the fake-useragent library: pip install fake-useragent

url = 'http://www.pearvideo.com/category_1'
# generate a random UA; if this raises an error, the following flags can help:
# ua = UserAgent(verify_ssl=False, use_cache_server=False).random
# disable the server cache:
# ua = UserAgent(use_cache_server=False)
# skip the local cache:
# ua = UserAgent(cache=False)
# skip ssl verification:
# ua = UserAgent(verify_ssl=False)
ua = UserAgent().random
headers = {
    'User-Agent': ua
}

# fetch the category index page
page_text = requests.get(url=url, headers=headers).text
# parse the detail-page links out of the index page
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')  # li elements under the video list container

detail_urls = []
for li in li_list:
    detail_url = 'http://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
    detail_urls.append(detail_url)

# fetch each detail page and extract the direct video url
for url in detail_urls:
    page_text = requests.get(url=url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)"', page_text, re.S)[0]
    data = requests.get(url=video_url, headers=headers).content
    fileName = str(random.randint(1, 10000)) + '.mp4'  # random video file name
    with open(fileName, 'wb') as fp:
        fp.write(data)
    print(fileName + ' is over')

  2.2 Thread pool scraping:

import requests
import random
from lxml import etree
import re
from fake_useragent import UserAgent
# install the fake-useragent library: pip install fake-useragent
# import the thread-pool module
from multiprocessing.dummy import Pool

# instantiate a thread-pool object
pool = Pool()
url = 'http://www.pearvideo.com/category_1'
# generate a random UA
ua = UserAgent().random
headers = {
    'User-Agent': ua
}

# fetch the category index page
page_text = requests.get(url=url, headers=headers).text
# parse the detail-page links out of the index page
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

detail_urls = []  # urls of the second-level (detail) pages
for li in li_list:
    detail_url = 'http://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
    detail_urls.append(detail_url)

video_urls = []  # direct urls of the video files
for url in detail_urls:
    page_text = requests.get(url=url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)"', page_text, re.S)[0]
    video_urls.append(video_url)

# save must be defined before it is handed to the pool
def save(data):
    fileName = str(random.randint(1, 10000)) + '.mp4'
    with open(fileName, 'wb') as fp:
        fp.write(data)
        print(fileName + ' saved')

# download the video data through the thread pool
func_request = lambda link: requests.get(url=link, headers=headers).content
video_data_list = pool.map(func_request, video_urls)
# save the video data through the thread pool
pool.map(save, video_data_list)

pool.close()
pool.join()
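In the version above only the downloads and saves go through the pool; the detail pages are still fetched one by one. A minimal sketch (reusing the detail_urls, headers, save, and pool built above) that pools the detail-page requests as well:

# extract the direct video url from one detail page
def get_video_url(detail_url):
    page_text = requests.get(url=detail_url, headers=headers).text
    return re.findall('srcUrl="(.*?)"', page_text, re.S)[0]

# all three stages now run through the pool
video_urls = pool.map(get_video_url, detail_urls)
video_data_list = pool.map(lambda link: requests.get(url=link, headers=headers).content, video_urls)
pool.map(save, video_data_list)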

3. Example project (Pear Video with xpath and re)

from lxml import etree
import requests
import re
import os
from uuid import uuid4
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36"
}

# pool of proxy IPs
all_ips = []

# proxy vendor API endpoint ("代理精灵")
api_url = "http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=20&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson="
page_text = requests.get(api_url, headers=headers).text
tree = etree.HTML(page_text)
# pull the IP values out of the response
ip_list = tree.xpath('//body//text()')
# store each IP as a proxies dict
for ip in ip_list:
    ip_dict = {'https': ip}
    all_ips.append(ip_dict)

# automotive news category url
url = "https://www.pearvideo.com/category_31"
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath("//div[@class='category-top']/div/ul/li | //*[@id='categoryList']/li")
for li in li_list:
    # video detail page
    img_url = "https://www.pearvideo.com/" + li.xpath("./div/a/@href")[0]
    # print(img_url)
    video_page_text = requests.get(img_url, headers=headers).text
    # extract the video title with a regex
    ex_title = 'class="video-tt-box".*?video-tt">(.*?)</h1>'
    pa = re.compile(ex_title, re.S)
    video_title = pa.findall(video_page_text)[0]
    # regex out the direct video url
    ex = ',srcUrl="(.*?)",'
    video_src = re.findall(ex, video_page_text, re.S)[0]
    print(video_src)
    if not os.path.exists("lishipin"):
        os.mkdir("lishipin")
    # path the video is saved to
    filename = f"{uuid4()}.mp4"
    file_path = "lishipin/" + filename
    video_content = requests.get(url=video_src, headers=headers, proxies=random.choice(all_ips)).content
    # print(video_content)
    with open(file_path, "wb") as fp:
        fp.write(video_content)
    print(video_title, " downloaded!")