环境搭建
- requests:获取数据
- lxml:解析数据
本次爬取糗事百科,爬取地址:http://www.qiushibaike.com/8hr/page/1/
python3 代码示例
import requests import threading from queue import Queue from lxml import etree import json class Thread_Crawl(threading.Thread): def __init__(self,threadName,pageQueue,dataQueue): super().__init__() self.threadName = threadName self.pageQueue = pageQueue self.dataQueue = dataQueue self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} def run(self): print("%s启动了" % self.threadName) while not self.pageQueue.empty(): url = self.pageQueue.get() res = requests.get(url, headers=self.headers).text if not self.dataQueue.full(): self.dataQueue.put(res) print("%s结束了" % self.threadName) class Thread_Parse(threading.Thread): def __init__(self,threadName,dataQueue,lock,file): super().__init__() self.threadName = threadName self.dataQueue = dataQueue self.lock = lock self.file = file def run(self): print("%s启动了"%self.threadName) while not self.dataQueue.empty(): html = etree.HTML(self.dataQueue.get()) self.parse(html) print("%s结束了" % self.threadName) def parse(self,html): print(html.text) node_list = html.xpath('//div[contains(@id, "qiushi_tag")]') for node in node_list: username = node.xpath('./div/a/@title')[0] image = node.xpath('.//div[@class="thumb"]//@src' ) # [0] content = node.xpath('.//div[@class="content"]/span')[0].text zan = node.xpath('.//i')[0].text comments = node.xpath('.//i')[1].text items = { "username" : username, "image" : image, "content" : content, "zan" : zan, "comments" : comments } with self.lock: self.file.write(json.dumps(items) + "\n") if __name__ == "__main__": pageQueue = Queue(11) dataQueue = Queue() lock = threading.Lock() file = open("data.json", "a") for x in range(1,3): url = 'http://www.qiushibaike.com/8hr/page/' + str(x) + '/' pageQueue.put(url) CrawlList = ['采集线程1','采集线程2','采集线程3'] for crawlName in CrawlList: thread = Thread_Crawl(crawlName,pageQueue,dataQueue) thread.start() thread.join() ParseList = ['解析线程1', '解析线程2', '解析线程3'] for parseName in 
ParseList: thread = Thread_Parse(parseName, dataQueue,lock,file) thread.start() thread.join() with lock: file.close()
View Code
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:python 多线程爬虫 - Python技术站