A typical crawler can be broken down into the following steps:

1. Open the target page

2. Parse the page

3. Process/store the data and add newly found pages as tasks

For asynchronous crawling, a scheduler is also needed.

For a simple crawler (no complicated CAPTCHA to deal with, and the pages can be fetched with requests/urllib just by adjusting the cookie and headers), an opener and a parser are enough. Data handling and new-task generation can go straight into the parser class, and gevent can handle the async part directly, as in the sketch below.
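
For step 1, the opener is often no more than a requests session with an adjusted header and cookie. A minimal sketch (the cookie name/value below is a placeholder, not something the target site actually requires):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})    # look like a browser
session.cookies.set('example_cookie', 'placeholder')     # only if the site needs one

resp = session.get('https://www.zhihu.com/topic/19551147/top-answers')   # step 1: open
print resp.status_code
# resp.text then goes to the parser (step 2), which extracts data and new task urls (step 3)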

 

Project path: ur'D:\python_py\my_scrapy/scrapy_tools'

# add an __init__.py under scrapy_tools so it can be used as a package

itemparse.py

Build an XPath layout that mirrors the structure of the data.
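
Before the full module, a tiny illustration of that idea: one XPath selects the node of each record, and the field XPaths start with '.' so they only search within that node (the HTML snippet is made up for demonstration):

from lxml import html

snippet = ('<div><div class="feed-main"><a class="question_link">Q1</a></div>'
           '<div class="feed-main"><a class="question_link">Q2</a></div></div>')
tree = html.document_fromstring(snippet)

for node in tree.xpath('//div[@class="feed-main"]'):          # one node per record
    print node.xpath('.//a[@class="question_link"]/text()')   # ['Q1'], then ['Q2']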

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 07 17:24:34 2017

@author: willowj
"""
# Python 2 workaround: set the default encoding to utf8 while keeping the
# std streams, which reload(sys) would otherwise reset
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')


import gevent
import pandas as pd 
import numpy as np
from lxml import html
import time
import codecs
import json


def list_0e(list_):
    """Return the first element of a list, or None if it is empty;
    non-list values are passed through unchanged."""
    if isinstance(list_, list):
        if not list_:
            return None
        else:
            if len(list_) > 1:
                print 'warning : list>1, list[1]:', list_[1]  #, len(list_)
            return list_[0]
    else:
        return list_


class ItemParse(object):
    """docstring for zhihu_topi"""
    name = 'ItemParse'

    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None

    # xpath for the node that wraps one record, plus the fields inside each record
    items_node_x = '//div[@class="feed-main"]'
    # note: each field is searched inside a single record node, so its xpath must start with '.'
    item_xs = dict(
        question_name = '''.//a[@class='question_link']/text()''', 
        #question_href = '''.//a[@class='question_link']/@href''', 
        author = './/div[@data-action="/answer/content"]/@data-author-name',
        author_href = '''.//a[@class='author-link']/@href''',  
        ups_x = './/div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text = ".//textarea/text()",
        commentN = './/a[@name="addcomment"]/text()[last()]',
        entry_url = './/div[@data-action="/answer/content"]/@data-entry-url',

        # a compiled regex pattern may be used instead of an xpath, e.g.
        # z = re.compile('\.')
        )    
    
    # url pattern for the remaining pages
    def getnextpages(self):
        # customise the pagination rule here; returns None when there is only one page
        if self.pageN > 1:
            urls = [self.base_url + '?page=%s' % n
                        for n in range(self.pageN, 1, -1)
                    ]
            return urls


    def __init__(self, html_):
        #self.item_atrr_xpath()
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)


    def parase(self, html_):
        # prefer xpath, fall back to re; items that are not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        #results = []
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item,itemx in self.item_xs.items():
                # a compiled regex pattern has .findall
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # an xpath string, evaluated relative to this record node
                elif isinstance(itemx, basestring):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with ./')
                else:
                    print item
                    raise TypeError('item is neither a compiled re pattern nor an xpath string')
                
                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = '\n'.join([str(__i) for __i in ite[item]])
                
            self.results.append(ite)
        
        #new_url
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x)) 

    # find out how many pages there are
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, basestring):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
        return 1

    # plain, sequential fetching of all remaining pages
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            #if not hasattr(opener, 'get')    
            _re = opener.get(url)
            print _re.status_code,  _re.url
            self.parase(_re.text)
            print time.time()
    # for now, async control and the save methods live in this class
    # gevent coroutine worker
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            #start_time = time.time()
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            #gevent.sleep(5)
            print zhihu_re.status_code,  url
            self.parase(zhihu_re.text) 
            print time.time()
    # gevent entry point: spawn g_n coroutine workers
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''
        param:  opener_class : class that builds a page opener (an object with a .get(url) method)
                g_n: number of coroutines, 4 by default
        '''
        from gevent import monkey; monkey.patch_all()  # patch blocking IO so the coroutines can switch
          
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
                for i in range(g_n)
                ]
        gevent.joinall(gs)    

        print time.time() - start_time 
        self.save_to_excel()

    def save_to_excel(self, path=None):  
        if path:  
            save_name = path
        else:     
            save_name = u''+ self.name \
                           + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                           + '.xlsx'
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')        
        print 'saved to ' + save_name


    def save_to_json(self, path=None):
        if path:  
            save_name = path
        else:     
            save_name = u''+ self.name \
                           + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                           + '.json'
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
            
        print 'saved to '+ save_name
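
A usage sketch for the module above. The Opener wrapper here is hypothetical; any class whose instances expose a .get(url) method returning a requests-style response (status_code, url, text) will do:

import requests
from scrapy_tools.itemparse import ItemParse

class Opener(object):
    # get_nextpages_by_gevent calls opener_class(), so each coroutine gets its own session
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
    def get(self, url):
        return self.session.get(url)

first = Opener().get(ItemParse.base_url)        # step 1: open the entry page
topic = ItemParse(first.text)                   # step 2: parse page 1, detect how many pages exist
topic.get_nextpages_by_gevent(Opener, g_n=4)    # fetch the remaining pages concurrently, then save to .xlsx
# topic.results holds one dict per record; topic.save_to_json() is also available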
