The crawler built in the previous parts still has a number of shortcomings, so here we make some necessary extensions and improvements.
1. Adding proxy servers
First, we write a separate crawler that collects free proxy servers from the web.
The proxy-harvesting program is "getproxy2.py"; its code is as follows:
```python
from bs4 import BeautifulSoup
import urllib2
from myLog import MyLog
import csv
import time
import re


class Item(object):
    """One proxy-server record."""
    IP = None        # IP address
    port = None      # port
    type = None      # protocol type (HTTP/HTTPS)
    address = None   # geographic location


class Get_proxy(object):
    def __init__(self):
        self.log = MyLog()
        self.log.info(u'Get_proxy started!')
        self.urls = self.get_urls()
        self.log.info(u'Collected %d URLs to visit' % len(self.urls))
        self.proxy_list = self.spider(self.urls)
        self.log.info(u'Harvested %d proxy-server addresses' % len(self.proxy_list))
        self.alivelist = self.testproxy(self.proxy_list)
        self.pipelines(self.alivelist)
        self.log.info(u'Get_proxy finished!')

    def get_urls(self):
        """Build the list of proxy-list pages to crawl (first 20 pages)."""
        urls = []
        num_max = 20
        for n in range(1, num_max + 1):
            url = 'http://www.xicidaili.com/wn/' + str(n)
            urls.append(url)
        return urls

    def getresponsecontent(self, url):
        """Fetch a URL with a browser-like User-Agent; return '' on failure."""
        try:
            headers = {"User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"}
            request = urllib2.Request(url.encode('utf8'), headers=headers)
            response = urllib2.urlopen(request)
        except Exception:
            self.log.error(u'Failed to fetch URL: %s' % url)
            return ''
        else:
            self.log.info(u'Fetched URL: %s successfully' % url)
            return response

    def spider(self, urls):
        """Parse every proxy-list page and extract IP, port, location and type."""
        items = []
        for url in urls:
            time.sleep(10)            # pause between requests to avoid being blocked
            htmlcontent = self.getresponsecontent(url)
            if htmlcontent == '':
                continue
            soup = BeautifulSoup(htmlcontent, 'lxml')
            proxys = soup.find_all('tr', attrs={'class': 'odd'})
            for proxy in proxys:
                item = Item()
                elements = proxy.find_all('td')
                item.IP = elements[1].get_text().strip()
                item.port = elements[2].get_text().strip()
                item.address = elements[3].get_text().strip()
                item.type = elements[5].get_text().strip()
                items.append(item)
        return items

    def testproxy(self, proxylist):
        """Keep only the proxies that can actually reach the target site."""
        self.log.info(u'Testing the harvested proxy servers ...')
        aliveList = []
        ip_list = []
        URL = r'http://www.china-yao.com/'
        regex = re.compile(r'china-yao.com')
        for proxy in proxylist:
            if proxy.IP in ip_list:
                continue              # skip duplicate proxy servers
            server = proxy.type.lower() + r'://' + proxy.IP + ':' + proxy.port
            self.log.info(u'Testing %s' % server)
            opener = urllib2.build_opener(urllib2.ProxyHandler({proxy.type.lower(): server}))
            urllib2.install_opener(opener)
            try:
                response = urllib2.urlopen(URL, timeout=3)
                string = response.read()
            except Exception:
                self.log.info(u'%s failed to connect' % server)
                continue
            if regex.search(string):
                self.log.info(u'%s connected successfully' % server)
                ip_list.append(proxy.IP)
                aliveList.append(proxy)
        return aliveList

    def pipelines(self, alivelist):
        """Save the working proxies to proxylist.csv."""
        filename = 'proxylist.csv'
        self.log.info(u'Saving the working proxy servers to a csv file ...')
        with open(filename, 'wb') as f:
            writer = csv.writer(f)
            # writer.writerow(['IP', 'port', 'type', 'address'])  # optional header row
            for aliveproxy in alivelist:
                writer.writerow([aliveproxy.IP.encode('utf8'), aliveproxy.port.encode('utf8'),
                                 aliveproxy.type.encode('utf8'), aliveproxy.address.encode('utf8')])
        self.log.info(u'Data saved!')


if __name__ == '__main__':
    Get_proxy()
```
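Once getproxy2.py has run, proxylist.csv contains one working proxy per row in the order written by pipelines() above: IP, port, type, address. Below is a minimal sketch of how the drug-price crawler from the previous part could consume that file; the original crawler code is not reproduced here, so the helper names load_proxies() and get_random_opener() are illustrative only, and the test URL simply reuses the china-yao.com address from testproxy().

```python
# Sketch only: load proxylist.csv (written by Get_proxy.pipelines) and
# route a request through a randomly chosen proxy. Function names here
# are illustrative, not part of the original article's code.
import csv
import random
import urllib2


def load_proxies(filename='proxylist.csv'):
    """Read the proxies saved by Get_proxy and return them as a list of dicts."""
    proxies = []
    with open(filename, 'rb') as f:
        for row in csv.reader(f):
            if len(row) >= 3:
                proxies.append({'ip': row[0], 'port': row[1], 'type': row[2].lower()})
    return proxies


def get_random_opener(proxies):
    """Build an opener that sends requests through a randomly chosen proxy."""
    proxy = random.choice(proxies)
    server = '%s://%s:%s' % (proxy['type'], proxy['ip'], proxy['port'])
    return urllib2.build_opener(urllib2.ProxyHandler({proxy['type']: server}))


if __name__ == '__main__':
    proxies = load_proxies()
    opener = get_random_opener(proxies)
    # Each page request in the main crawler could then go through the proxy:
    response = opener.open('http://www.china-yao.com/', timeout=10)
    print response.getcode()
```

In practice the crawler would pick a fresh proxy (or call urllib2.install_opener with a new opener) every few requests, and drop proxies from the list once they start timing out.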