# -*- coding: UTF-8 -*-
"""Scraper for mdl.com product pages.

Collects /product/item/NNNNNN sub-URLs from two locally saved listing
pages, then fetches each product page and appends
title&introduce&effect&crowd records to a UTF-8 text file.

NOTE(review): the reload(sys)/sys.setdefaultencoding('utf8') hack from the
original was removed; it was only needed because the fields were forced
through str(). Joining the unicode strings directly is correct on both
Python 2 and 3.
"""
import re
import codecs

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://mdl.com'
OUTPUT_PATH = r'E:\note\mei_infov3.txt'
# Item links look like /product/item/293410 (six digits); compiled once.
ITEM_URL_RE = re.compile(r'/product/item/\d{6}')


def _get_soup(url):
    """Fetch *url*, decode the response as UTF-8 and parse it with lxml."""
    web_data = requests.get(url)
    web_data.encoding = 'utf-8'
    return BeautifulSoup(web_data.text, 'lxml')


def mei_url():
    """Return the parsed product-listing page as a BeautifulSoup object."""
    return _get_soup(BASE_URL + '/product')


def mei_info(sub_url='/product/item/293410'):
    """Scrape one product page and append its fields to OUTPUT_PATH.

    :param sub_url: site-relative product path, e.g. '/product/item/293410'.
    Side effect: appends 'title&introduce&effect&crowd\\n$' to OUTPUT_PATH.
    Raises IndexError if the page layout changed and a selector matches
    nothing (same behavior as the original ``[0]`` indexing).
    """
    soup = _get_soup(BASE_URL + sub_url)

    # Shared selector prefixes -- the page nests everything under these.
    container = '#main > div.boundary > div > div.container__main > '
    body_sel = (container +
                'div.section.section-intro.clearfix > div > '
                'div.section-intro__item__body.rich-text')

    title = soup.select(
        container + 'div.section.section-info.clearfix > h2')[0].get_text()
    bodies = soup.select(body_sel)
    introduce = bodies[0].get_text()
    effect = soup.select(body_sel + ' > span')[0].get_text()
    # presumably bodies[1] is a section we don't need; index 2 is the
    # target-crowd text -- TODO confirm against a live page.
    crowd = bodies[2].get_text()

    print(title)
    with codecs.open(OUTPUT_PATH, 'a+', 'utf8') as out:
        # Join the unicode fields directly: no str() coercion, so no
        # setdefaultencoding hack is required.
        out.write('&'.join([title, introduce, effect, crowd]))
        out.write('\n')
        out.write('$')


def _extract_item_urls(path):
    """Return every /product/item/NNNNNN sub-URL found in a saved HTML file."""
    with open(path) as f:  # close the handle (original leaked it)
        soup = BeautifulSoup(f, 'lxml')
    return ITEM_URL_RE.findall(str(soup))


if __name__ == '__main__':
    # items = mei_url()  # live listing fetch kept for reference
    url_list = (_extract_item_urls(r'E:\note\mei.htm') +
                _extract_item_urls(r'E:\note\mei2.htm'))
    print(len(url_list))
    for sub_url in url_list:
        mei_info(sub_url)
# Site attribution footer (scraped along with the article; kept as a comment
# so the file remains valid Python):
# 本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:python爬虫之BeautifulSoup - Python技术站