A typical crawler can be broken down into the following steps:
1. Open the target page
2. Parse the page
3. Process/store the data and add newly discovered pages as tasks
In addition, asynchronous crawling needs a scheduler.
For a simple crawler, one with no complicated captcha handling where requests/urllib can fetch the pages just by setting the right cookies and headers, writing one opener and one parser is enough. Processing the data and generating new tasks can be done directly in the parser class, and gevent can provide the asynchrony; a minimal opener sketch follows below.
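As a concrete starting point, here is a minimal opener sketch built on requests. It is not part of the original project: the Opener class name, the header value and the cookie handling are placeholder assumptions, and any object that exposes a .get(url) method returning a response with .status_code, .url and .text will work with the parser further down.

import requests

class Opener(object):
    """Thin wrapper around requests.Session carrying custom headers/cookies."""

    def __init__(self, cookies=None):
        self.session = requests.Session()
        # present an ordinary browser User-Agent; adjust to whatever the target site expects
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        if cookies:
            # cookies: an optional dict of cookie name/value pairs
            self.session.cookies.update(cookies)

    def get(self, url, **kwargs):
        # the parser only relies on this signature; returns a requests.Response
        return self.session.get(url, **kwargs)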
Project path: ur'D:\python_py\my_scrapy/scrapy_tools'
# add an __init__.py under scrapy_tools so the directory can be used as a package
itemparse.py
Build the XPath structure to mirror the structure of the data being extracted:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 07 17:24:34 2017

@author: willowj
"""
import sys

# reset the default encoding to utf8 (Python 2) while keeping the std streams
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

import gevent
import pandas as pd
import numpy as np
from lxml import html
import time
import codecs
import json


def list_0e(list_):
    """Return the first element of a list (None if empty); pass non-lists through."""
    if isinstance(list_, list):
        if not list_:
            return None
        if len(list_) > 1:
            print 'warning: list > 1, list[1]:', list_[1]
        return list_[0]
    else:
        return list_


class ItemParse(object):
    """Reusable item parser: subclass and override the xpath class attributes."""
    name = 'ItemParse'
    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None

    # node that wraps one record, and the fields inside each record
    items_node_x = '//div[@class="feed-main"]'
    # note: every field is searched inside one item node, so its xpath starts with '.'
    item_xs = dict(
        question_name = '''.//a[@class='question_link']/text()''',
        #question_href = '''.//a[@class='question_link']/@href''',
        author = './/div[@data-action="/answer/content"]/@data-author-name',
        author_href = '''.//a[@class='author-link']/@href''',
        ups_x = './/div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text = ".//textarea/text()",
        commentN = './/a[@name="addcomment"]/text()[last()]',
        entry_url = './/div[@data-action="/answer/content"]/@data-entry-url',
        # a compiled regex also works as a field extractor, e.g.:
        #z = re.compile('\.')
        )

    # build the list of paging urls
    def getnextpages(self):
        if self.pageN > 1:
            # customise the paging rule here; nothing is returned when there is only one page
            urls = [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls

    def __init__(self, html_):
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)

    def parase(self, html_):
        # prefer xpath, fall back to re; fields that are not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item, itemx in self.item_xs.items():
                # compiled regex
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # xpath string
                elif isinstance(itemx, basestring):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with ./')
                else:
                    print item
                    raise TypeError('not a compiled re pattern or an xpath str')

                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = '\n'.join([str(__i) for __i in ite[item]])
            self.results.append(ite)
        # collect newly discovered task urls
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x))

    # read how many pages there are
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, basestring):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
        return 1

    # plain, synchronous fetch of all remaining pages
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            _re = opener.get(url)
            print _re.status_code, _re.url
            self.parase(_re.text)
            print time.time()

    # the async control and the save methods live in this class for now
    # gevent coroutine worker
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            #gevent.sleep(5)
            print zhihu_re.status_code, url
            self.parase(zhihu_re.text)
            print time.time()

    # gevent coroutine driver
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''
        params:
            opener_class: class that builds a page opener (one instance per coroutine)
            g_n: number of coroutines, default 4
        '''
        from gevent import monkey
        monkey.patch_all()
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
              for i in range(g_n)]
        gevent.joinall(gs)
        print time.time() - start_time
        self.save_to_excel()

    def save_to_excel(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                        + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                        + '.xlsx'
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')
        print 'saved to ' + save_name

    def save_to_json(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                        + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                        + '.json'
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
        print 'saved to ' + save_name
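A usage sketch under the same assumptions: Opener is the hypothetical requests-based class shown earlier, the import paths follow the scrapy_tools package layout above, and the zhihu XPaths in ItemParse date from 2017 so they may no longer match the live site. The first page is fetched up front so ItemParse can read the page count, then the remaining pages are crawled by gevent coroutines; to reuse the framework on another site, subclass ItemParse and override base_url, pageN_x, items_node_x and item_xs.

# run.py -- illustrative usage only
from scrapy_tools.itemparse import ItemParse
from opener import Opener  # hypothetical module holding the requests-based Opener sketched earlier

if __name__ == '__main__':
    opener = Opener()
    first = opener.get(ItemParse.base_url)
    parser = ItemParse(first.text)   # parses page 1 and works out how many pages exist
    # asynchronous: 4 gevent coroutines; saves an .xlsx when finished
    parser.get_nextpages_by_gevent(Opener, g_n=4)
    # synchronous alternative:
    # parser.get_nextpages(opener, sleep_sec=1)
    # parser.save_to_excel()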