"""Scrape company contact details from hc360.com (慧聪网).

Run as a script: reads company shop URLs (one per line) from ``uu.txt``,
fetches each page, extracts company name / contact name / address / phone
and writes the collected rows to ``huicong_beijing.xlsx``.
"""
# gevent's documentation asks for monkey-patching to happen as early as
# possible, before modules such as requests (and ssl) are imported.
from gevent import monkey
monkey.patch_all()

import random
import re
import time

import gevent
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pool of User-Agent strings; one is picked at random for every request.
UA_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
]

# requests selects a proxy by URL scheme, so each mapping must be keyed by
# 'http' and carry a well-formed 'http://host:port' URL.  The previous
# {'proxy': 'http:\\host:port'} entries never matched any request and were
# silently ignored.
proxies_list = [
    {'http': 'http://10.220.70.254:808'},
    {'http': 'http://10.221.70.254:808'},
    {'http': 'http://10.222.70.254:808'},
    {'http': 'http://10.223.70.254:808'},
]

# Seconds to wait for any single HTTP request.
REQUEST_TIMEOUT = 60

# Mobile number (11 digits) or landline written as 4 digits, dash, 8 digits.
# The pattern is unanchored: the first such run anywhere in the page wins.
PHONE_RE = re.compile(r'\d{11}|\d{4}-\d{8}')


def _build_headers():
    """Return request headers with a freshly chosen random User-Agent."""
    return {
        'User-Agent': random.choice(UA_list),
        'Referer': 'http://b2b.hc360.com/',
    }


# Kept as a module-level name for backward compatibility; the functions
# below build fresh headers per request so the User-Agent actually rotates.
headers = _build_headers()


def iter_diyu(sheng, shi, pages=100):
    """Yield the ``href`` of every company link in the hc360 search results.

    Pages are fetched lazily, one HTTP request per page, as the generator
    is consumed.

    :param sheng: value substituted into the ``w`` and ``z`` (province part)
        query fields of the search URL.
    :param shi: value substituted into the city part of the ``z`` field.
    :param pages: number of result pages to walk, starting at page 1.
    """
    for page in range(1, pages + 1):
        or_url = 'http://s.hc360.com/?w={}&mc=enterprise&ee={}&z=%D6%D0%B9%FA%3A{}%CA%A1%3A{}'.format(sheng, page, sheng, shi)
        res = requests.get(or_url, headers=_build_headers(),
                           timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.text, 'lxml')
        for link in soup.select('dd.til > h3 > a'):
            yield link.get('href')


def diyu(sheng, shi):
    """Return the first company link found in the search results, or None.

    Result pages 1..100 are requested in order until one contains a link.
    Use :func:`iter_diyu` to obtain every link instead of only the first.
    """
    return next(iter_diyu(sheng, shi), None)


def _fetch_soup(url):
    """GET *url* through a random proxy; return ``(response, parsed soup)``."""
    res = requests.get(url, headers=_build_headers(),
                       proxies=random.choice(proxies_list),
                       timeout=REQUEST_TIMEOUT)
    return res, BeautifulSoup(res.text, 'lxml')


def _select_first(soup, selector):
    """Return the first element matching *selector*.

    :raises IndexError: when nothing matches (same exception type as the
        plain ``select(...)[0]`` lookup, but naming the selector).
    """
    found = soup.select(selector)
    if not found:
        raise IndexError('no element matches selector: ' + selector)
    return found[0]


def _find_phone(soup):
    """Return the first phone-number-like string in the page markup.

    :raises ValueError: when the page contains no match for ``PHONE_RE``.
    """
    match = PHONE_RE.search(str(soup))
    if match is None:
        raise ValueError('no phone number found in page')
    return match.group()


def url_parser(urld):
    """Fetch a company shop URL and extract its contact details.

    The page layout is detected from marker text in the HTML and the
    matching ``url_*parer`` extractor is applied.

    :param urld: shop base URL; ``'shop/company.html'`` is appended to it
        directly, so it is expected to end with ``'/'``.
    :returns: dict with keys ``conpany_name``, ``name``, ``address`` and
        ``phone``, or ``None`` when the first request returns HTTP 404.
    :raises IndexError, ValueError: when an expected element or phone
        number is missing from the page.
    """
    res, soup = _fetch_soup(urld)
    # status_code is an int; comparing it with the string '404' was always
    # unequal, so missing pages used to be parsed as if they existed.
    if res.status_code == 404:
        return None

    # '公司黄页' ("company yellow pages") marks the yellow-pages layout.
    if '公司黄页' in str(soup):
        return url_HYparer(soup)

    # Otherwise the details live on the shop's company-profile page.
    _, soup1 = _fetch_soup(urld + 'shop/company.html')
    html = str(soup1)
    if '手机极速版' in html:   # "mobile express edition" layout
        return url_SJJSparer(soup1)
    if '未认证 ' in html:      # "not certified" layout (trailing space intended)
        return url_uncertifie(soup1)
    return url_NSJJSparer(soup1)


# NOTE: the 'conpany_name' key spelling is kept exactly as in the original
# so the column names of the generated spreadsheet do not change.

def url_NSJJSparer(soup):
    """Extract contact details from the default company-profile layout."""
    address_item = _select_first(
        soup, 'td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')
    return {
        'conpany_name': _select_first(soup, 'td.contitlebg > span').text.strip(),
        'name': _select_first(soup, 'span.bluezzbold.font14').text.strip(),
        # This layout carries the address in the <li> title attribute.
        'address': address_item.get('title'),
        'phone': _find_phone(soup),
    }


def url_HYparer(soup):
    """Extract contact details from the yellow-pages layout.

    Values are returned as found in the page, without stripping whitespace.
    """
    return {
        'conpany_name': _select_first(soup, 'div.sub-info > h1').text,
        'name': _select_first(soup, 'samp').text,
        'address': _select_first(
            soup,
            'div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(3) > span.conRight').text,
        'phone': _select_first(
            soup,
            'div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(2) > span.conRight').text,
    }


def url_SJJSparer(soup):
    """Extract contact details from the "mobile express edition" layout."""
    return {
        'conpany_name': _select_first(soup, 'div.ContacCon1 > h3').text.strip(),
        'name': _select_first(soup, 'div.ContactsName > span > a').text.strip(),
        'address': _select_first(
            soup,
            'div.ContacCon3 > ul > li:nth-of-type(1) > div.con3Rig').text.strip(),
        'phone': _find_phone(soup),
    }


def url_uncertifie(soup):
    """Extract contact details from the "not certified" company layout."""
    return {
        'conpany_name': _select_first(soup, 'td.contitlebg_1 > span').text.strip(),
        'name': _select_first(soup, 'span.bluezzbold.font14').text.strip(),
        'address': _select_first(
            soup,
            'td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)').text.strip(),
        'phone': _find_phone(soup),
    }


def main():
    """Scrape every URL listed in ``uu.txt`` and export the rows to Excel."""
    info_total = []
    with open('uu.txt', 'r') as f:
        for line in f:
            url = line.strip()
            if not url:
                continue  # ignore blank lines instead of issuing a bad request
            try:
                info_ary = url_parser(url)
            except Exception as e:
                # Top-level boundary: report the failing URL and keep going.
                print(e, url)
                continue
            # Random pause between requests.
            time.sleep(random.randint(1, 5))
            if info_ary is None:
                print('HTTP 404, skipped', url)
                continue
            info_total.append(info_ary)
            print(len(info_total))
    df = pd.DataFrame(info_total)
    df.to_excel('huicong_beijing.xlsx')
    print('Done')


if __name__ == '__main__':
    main()
# Unless otherwise noted, articles on this site are original work; if you repost, please credit the source: 慧聪网爬虫 - Python技术站