Because the earlier crawler had various shortcomings, we now make some necessary extensions and improvements.

1. Adding Proxy Servers

First, we write another crawler that collects free proxy servers from the web.

Write the proxy-collecting crawler "getproxy2.py" as follows:

from bs4 import BeautifulSoup
import urllib2
from myLog import MyLog
import csv
import time
import re

class Item(object):
    IP = None       # IP address
    port = None     # port
    type = None     # protocol type (HTTP/HTTPS)
    address = None  # location

class Get_proxy(object):
    def __init__(self):
        self.log = MyLog()
        self.log.info(u'Get_proxy started!')
        self.urls = self.get_urls()
        self.log.info(u'Collected the URLs to visit, %d in total' % len(self.urls))
        self.proxy_list = self.spider(self.urls)
        self.log.info(u'Collected %d proxy server addresses' % len(self.proxy_list))
        self.alivelist = self.testproxy(self.proxy_list)
        self.pipelines(self.alivelist)
        self.log.info(u'Get_proxy finished!')

    def get_urls(self):
        urls = []
        num_max = 20  # scrape the first 20 listing pages
        for n in range(1, num_max + 1):
            url = 'http://www.xicidaili.com/wn/' + str(n)
            urls.append(url)
        return urls

    def getresponsecontent(self, url):
        try:
            headers = {"User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"}
            request = urllib2.Request(url.encode('utf8'), headers=headers)
            response = urllib2.urlopen(request)
        except Exception:
            self.log.error(u'Failed to fetch URL: %s' % url)
            return ''
        else:
            self.log.info(u'Successfully fetched URL: %s' % url)
            return response.read()

    def spider(self, urls):
        items = []
        for url in urls:
            time.sleep(10)  # wait between requests so the site does not block us
            htmlcontent = self.getresponsecontent(url)
            if htmlcontent == '':
                continue
            soup = BeautifulSoup(htmlcontent, 'lxml')
            # each proxy entry is a <tr> row carrying the class 'odd'
            proxys = soup.find_all('tr', attrs={'class': 'odd'})
            for proxy in proxys:
                item = Item()
                elements = proxy.find_all('td')
                item.IP = elements[1].get_text().strip()
                item.port = elements[2].get_text().strip()
                item.address = elements[3].get_text().strip()
                item.type = elements[5].get_text().strip()
                items.append(item)
        return items

    def testproxy(self, proxylist):
        self.log.info(u'Testing the collected proxy servers ...')
        aliveList = []
        ip_list = []
        URL = r'http://www.china-yao.com/'
        regex = re.compile(r'china-yao.com')
        for proxy in proxylist:
            if proxy.IP in ip_list:
                continue  # skip duplicate proxy servers
            server = proxy.type.lower() + r'://' + proxy.IP + ':' + proxy.port
            self.log.info(u'Testing %s' % server)
            opener = urllib2.build_opener(urllib2.ProxyHandler({proxy.type.lower(): server}))
            urllib2.install_opener(opener)
            try:
                response = urllib2.urlopen(URL, timeout=3)
            except Exception:
                self.log.info(u'%s failed to connect' % server)
                continue
            else:
                try:
                    string = response.read()
                except Exception:
                    self.log.info(u'%s failed to connect' % server)
                    continue
                # the proxy counts as alive only if the fetched page content looks right
                if regex.search(string):
                    self.log.info(u'%s connected successfully .......' % server)
                    ip_list.append(proxy.IP)
                    aliveList.append(proxy)
        return aliveList

    def pipelines(self, alivelist):
        filename = 'proxylist.csv'
        self.log.info(u'Saving the collected proxy server addresses to a csv file...')
        with open(filename, 'wb') as f:
            writer = csv.writer(f)
            # writer.writerow(['IP', 'port', 'type', 'address'])  # optional header row
            for aliveproxy in alivelist:
                writer.writerow([aliveproxy.IP.encode('utf8'), aliveproxy.port.encode('utf8'),
                                 aliveproxy.type.encode('utf8'), aliveproxy.address.encode('utf8')])
        self.log.info(u'Data saved!')

if __name__ == '__main__':
    Get_proxy()
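
Once the script finishes, proxylist.csv holds only the proxies that passed the liveness test. The main crawler can then load this file and route its urllib2 requests through a randomly chosen proxy. Below is a minimal sketch of that consuming side; it is not part of getproxy2.py, and the helper name get_random_proxy is my own, but the column order IP, port, type, address matches what pipelines() writes:

import csv
import random
import urllib2

def get_random_proxy(filename='proxylist.csv'):
    # each row written by getproxy2.py is: IP, port, type, address
    with open(filename, 'rb') as f:
        rows = [row for row in csv.reader(f) if row]
    ip, port, ptype, _ = random.choice(rows)
    return {ptype.lower(): '%s://%s:%s' % (ptype.lower(), ip, port)}

# install an opener so every later urllib2 request goes through the chosen proxy
opener = urllib2.build_opener(urllib2.ProxyHandler(get_random_proxy()))
urllib2.install_opener(opener)
html = urllib2.urlopen('http://www.china-yao.com/', timeout=5).read()

If the chosen proxy has died since it was tested, urlopen raises an exception, so a real crawler would catch it and retry with another row from the file.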
