A few days after I started the job, my boss asked me to scrape some Taobao transaction records so we could take a look at the data. I wrote it in Python and split the crawl into two steps. The first step scrapes the product links; the code is as follows:

#-*- coding:utf-8 -*-

import BeautifulSoup
import urllib2

class MyParser:
    def __init__(self,seedurl,destpath,stop_file_path):
        self.seedurl=seedurl
        self.stop_file_path=stop_file_path
        stop_file=open(stop_file_path,"rb")
        splits=stop_file.readline().split("\t")
        stop_file.close()
        self.no_0=splits[0]    # offset stored in the stop file: starts at 0
        self.no_1=splits[1]    # index of the item on the current page
        self.no_2=splits[2]    # index of the record of the current item
        self.destpath=destpath
    def run(self):
        print self.no_0
        while int(self.no_0)<5*44:   # 44 items per result page, crawl the first 5 pages
            # build the page URL from the fixed seed plus the current offset
            # (do not append to self.seedurl itself, or the URL keeps growing every loop)
            url=self.seedurl+str(self.no_0)
            headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
            req=urllib2.Request(url=url,headers=headers)

            content=urllib2.urlopen(req).read()
            contentsoup=BeautifulSoup.BeautifulSoup(content)

            items=contentsoup.findAll("div",{"class":"col title"})
            out_file=open(self.destpath,"a+")
            for item in items:
                print item.find("a")["href"]
                out_file.write(item.find("a")["href"]+"\n")
                out_file.flush()
            out_file.close()
            self.no_0=int(self.no_0)+44   # the "s" query parameter advances by 44 per page

        print "ok"
def run():
    seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
    item_stop_file="e://item_stop_file"
    record_stop_file="s://record_stop_file"   # not used in this step
    outFile="e://out"
    myParser=MyParser(seedurl,outFile,item_stop_file)
    myParser.run()
if __name__=="__main__":
    run()
    print "done!"

This gives us the output file e://out, where each line is one product link.
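
Before moving on to step two, it can be handy to sanity-check that file. Here is a minimal sketch (check_links is a name I made up; it is not part of the scripts in this post) that reads e://out, strips the trailing newlines and drops duplicate links:

#-*- coding:utf-8 -*-
# minimal sketch: read back the link file written by the first script
def check_links(path="e://out"):
    seen=set()
    links=[]
    link_file=open(path,"rb")
    for line in link_file.readlines():
        url=line.strip()              # drop the trailing "\n" written by the first script
        if url and url not in seen:   # skip blank lines and duplicates
            seen.add(url)
            links.append(url)
    link_file.close()
    print "%d unique links" % len(links)
    return links

if __name__=="__main__":
    check_links()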

Next, using the link file scraped above, crawl the transaction records of each product. The code is as follows:

#-*- coding:utf-8 -*-
'''
Created on 2014-07-23

@author: sj
'''
import re
import BeautifulSoup
import urllib2

 
class MyParser:
    def __init__(self,item_path_file,stop_file,out_file):
        self.item_path_file=item_path_file
        self.stop_file=stop_file
        self.out_file=out_file
        stop_object=open(self.stop_file,"rb")
        splits=stop_object.readline().split("\t")
        stop_object.close()
        self.item=splits[0]     # index of the item link to resume from
        self.page=splits[1]     # page position saved in the stop file
        self.record=splits[2]   # record position saved in the stop file
        self.tag=0              # becomes 1 once a record page has been fetched for the current item
    def run(self):
        print self.item
        print self.page
        print self.record
        item_object=open(self.item_path_file,"rb")
        num_items=len(item_object.readlines())
        item_object.close()
        item_object=open(self.item_path_file,"rb")
        # start from the item index saved in the stop file
        for line in item_object.readlines()[int(self.item):num_items]:
                line=line.strip()   # drop the trailing newline so the link can be used as a URL
                try:
                    if re.search("tmall",line):
                        stop_object=open(self.stop_file,"rb")
                        item_new=stop_object.readline().split("\t")[0]
                        stop_object.close()
                        stop_object=open(self.stop_file,"wb")
                        stop_object.write(item_new+"\t"+"0"+"\t"+"0"+"\n")
                        stop_object.flush()
                        stop_object.close()
                        continue
                    print line
                    headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                    req=urllib2.Request(url=line,headers=headers)
                    
                    content=urllib2.urlopen(req,timeout=3).read()
                    contentSoup=BeautifulSoup.BeautifulSoup(content)
                    
                    # the buyer-list button carries a "data-api" URL that returns the transaction records as JSONP
                    data_api=contentSoup.find("button",{"id":"J_listBuyerOnView"})["data-api"]
                    parameters=data_api.split("?")[1]
                    # resume from the bid_page saved in the stop file
                    stop_object=open(self.stop_file,"rb")
                    bid_page=stop_object.readline().split("\t")[1]
                    stop_object.close()
                    # the page_size value from the data-api query string is used as the upper bound for bid_page
                    page_size=int(parameters.split("&")[2].split("=")[1])
        
                    while int(bid_page)<int(page_size):
                        print "bid_page has not reached page_size yet..."
                        print bid_page
                        # bump bid_page in the data-api URL; after the first fetch (tag==1) the URL
                        # already carries the current bid_page, otherwise replace the default bid_page=1
                        if self.tag==1:
                            data_api=data_api.replace("bid_page="+str(bid_page),"bid_page="+str(int(bid_page)+1))
                        else:
                            data_api=data_api.replace("bid_page=1","bid_page="+str(int(bid_page)+1))
                        data_url=data_api+"&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
            
                        req=urllib2.Request(url=data_url,headers=headers)
                        datacontent=urllib2.urlopen(req,timeout=3).read()
                        datacontent=datacontent.decode("gbk").encode("utf-8")
                        self.deal(datacontent)
                        
                        bid_page=int(bid_page)+1
                        
                        stop_object=open(self.stop_file,"wb")
                        stop_object.write(str(self.item)+"\t"+str(bid_page)+"\t"+"0")
                        stop_object.flush()
                        stop_object.close()
                        self.tag=1
                        print self.item
                    if int(bid_page)>=page_size:
                        print "reached page_size, saving the next item index with page 0 and record 0"
                        stop_object=open(self.stop_file,"wb")
                        stop_object.write(str(int(self.item)+1)+"\t0\t0\n")
                        stop_object.close()
                        self.item=int(self.item)+1
                except Exception as e:
                    # a timed-out request skips this link; other errors also fall through to the next link
                    if str(e)=="timed out":
                        continue
            
    def deal(self,content):
        # strip the JSONP wrapper and keep only the HTML fragment between the quotes
        ls=[m.start() for m in re.finditer("\"",content)]
        content=content[(ls[0]+1):ls[-3]]
        contentSoup=BeautifulSoup.BeautifulSoup(content)
        recordshtml=contentSoup.find("tbody")
        if recordshtml==None:
            return 
        recordshtml=recordshtml.findAll("tr")
        for record in recordshtml:
            cols=record.findAll("td")
            if len(cols)!=5:
                continue
            name=cols[0].text
            price_em=cols[1].findAll("em")
            price=price_em[-1].text
            num=cols[2].text
            time=cols[3].text
            type=cols[4].text
            line=name+"\t"+price+"\t"+num+"\t"+time+"\t"+type+"\n"
            print line
            out_object=open(self.out_file,"a+")
            out_object.write(line)
            out_object.flush()
            out_object.close()
        print "ok"
def run():
    item_path_file="e:/item_path_file"
    stop_file="e://stop_file"
    out_file="e://records_file"
    parser=MyParser(item_path_file,stop_file,out_file)
    parser.run()
if __name__=="__main__":
    run()
    print "done~"

Here item_path_file is the product-link file scraped in step one, and stop_file records how far the crawl has gotten so that it can resume after an interruption; strictly speaking you could do without it. The program above does not write out a separate file of links that failed to crawl.
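
The stop_file itself is just one line of three tab-separated numbers: item index, bid_page, record index. Below is a minimal sketch of reading and rewriting it under that assumption; read_stop and save_stop are hypothetical helpers, not part of the script above:

#-*- coding:utf-8 -*-
# minimal sketch of the stop_file layout used above: "item\tpage\trecord"
def read_stop(path="e://stop_file"):
    stop_object=open(path,"rb")
    item,page,record=stop_object.readline().split("\t")[:3]
    stop_object.close()
    return int(item),int(page),int(record)

def save_stop(item,page,record,path="e://stop_file"):
    stop_object=open(path,"wb")
    stop_object.write(str(item)+"\t"+str(page)+"\t"+str(record)+"\n")
    stop_object.close()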

Note that some of the scraped links may point to Tmall items, but Tmall's transaction-record page has a different format from Taobao's, so Tmall links are simply filtered out here.
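
The filter is nothing more than a substring check on the URL, the same re.search("tmall",line) used in the crawler above; a standalone illustration (the URL is a made-up example):

#-*- coding:utf-8 -*-
import re

url="http://detail.tmall.com/item.htm?id=123456"   # hypothetical link, for illustration only
if re.search("tmall",url):
    print "tmall link, skipped"
else:
    print "taobao link, kept"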

What improved in this crawl compared with my earlier attempts:

The use of try/except. I didn't use it before, so every time a request timed out I had to stop the program by hand, restart it, and resume from the breakpoint. With try/except, a timeout simply skips the current link, which cuts out a lot of manual work.
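
A somewhat cleaner way to catch only timeouts, instead of comparing the exception to a string, is sketched below; it assumes the same urllib2 setup as above, and fetch is a hypothetical helper, not part of the original script:

#-*- coding:utf-8 -*-
# minimal sketch: skip a link only when the request really timed out
import socket
import urllib2

def fetch(url,headers):
    try:
        req=urllib2.Request(url=url,headers=headers)
        return urllib2.urlopen(req,timeout=3).read()
    except socket.timeout:
        print "timed out, skipping "+url
        return None
    except urllib2.URLError as e:
        # urlopen sometimes wraps the timeout in a URLError whose reason is a socket.timeout
        if isinstance(getattr(e,"reason",None),socket.timeout):
            print "timed out, skipping "+url
            return None
        raise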

I later realized I had basically been doing everything by hand; there is also the Scrapy framework, which makes this kind of crawl a bit simpler.
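
For a rough idea, a bare-bones Scrapy spider for the link-collection step might look like the sketch below. This is only a sketch: it assumes a reasonably recent Scrapy version, it assumes the search page still renders the "col title" blocks server-side, and the query string is shortened compared with the seedurl above.

import scrapy

class TaobaoLinkSpider(scrapy.Spider):
    name="taobao_links"
    # one request per 44-item offset of the "s" parameter, same as the while loop in the first script
    start_urls=["http://s.taobao.com/search?q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&s="+str(s)
                for s in range(0,5*44,44)]

    def parse(self,response):
        # same "col title" class as the BeautifulSoup version above
        for href in response.css("div.col.title a::attr(href)").extract():
            yield {"url":href}

Running it with something like "scrapy runspider spider.py -o links.csv" would write the links straight to a CSV file, so the hand-rolled output file handling goes away.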