"""Scrape movie details and download links from dytt8.net list pages.

For each list page the first linked detail page is fetched, the movie
fields are extracted with regular expressions, and one ``str(dict)`` line
per match is appended to the file ``move_list``.
"""
import re
from urllib.request import urlopen

# Site root; list pages link to detail pages with site-relative hrefs.
BASE_URL = "http://www.dytt8.net"

# Captures the site-relative href of a movie entry on a list page.
LIST_LINK_RE = re.compile(
    r'<td height="26">.*?<b>.*?<a href="(?P<url_name>.*?)" class="ulink">.*?',
    re.S,
)

# Captures the fields of a movie detail page.
# NOTE(review): the pattern originally defined only the ``zhuyan`` and
# ``xiazaidizhi`` groups even though parsePage1 reads ``name``,
# ``pianname`` and ``daoyan`` as well, so every match raised IndexError.
# The three missing groups are reconstructed from their names, following
# the label convention of the existing ``◎主.*?演`` part -- confirm the
# labels and their order against a live detail page.
DETAIL_RE = re.compile(
    r'<div .*?'
    r'◎译.*?名(?P<name>.*?)<br />'
    r'◎片.*?名(?P<pianname>.*?)<br />'
    r'.*?◎导.*?演(?P<daoyan>.*?)<br />'
    r'.*?◎主.*?演(?P<zhuyan>.*?)<br /><br />'
    r'◎简.*?介.*?<td.*?><a href="(?P<xiazaidizhi>.*?)">',
    re.S,
)


def _strip_padding(text):
    """Return *text* without U+3000 (ideographic space) characters."""
    return text.replace("\u3000", "")


def getPage(url):
    """Fetch *url* and return its body decoded as GBK.

    Bytes that are not valid GBK are dropped (``errors='ignore'``).
    Errors raised by ``urlopen`` propagate to the caller.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(url) as response:
        return response.read().decode("gbk", errors="ignore")


def parsePage(s):
    """Return the absolute URL of the first movie link in list page *s*.

    Returns ``None`` when *s* contains no matching link.
    """
    # NOTE(review): only the first entry of each list page is used; the
    # remaining entries on the page are never visited.
    match = LIST_LINK_RE.search(s)
    if match is None:
        return None
    return BASE_URL + match.group("url_name")


def parsePage1(s):
    """Yield one dict of movie fields per match found in detail page *s*.

    Keys are ``yiming``, ``pianming``, ``daoyan``, ``zhuyan`` and
    ``xiazaidizhi``; U+3000 padding is removed from every value.
    Nothing is yielded when *s* does not match.
    """
    for match in DETAIL_RE.finditer(s):
        yield {
            "yiming": _strip_padding(match.group("name")),
            "pianming": _strip_padding(match.group("pianname")),
            "daoyan": _strip_padding(match.group("daoyan")),
            "zhuyan": _strip_padding(match.group("zhuyan")),
            "xiazaidizhi": _strip_padding(match.group("xiazaidizhi")),
        }


def main(num):
    """Scrape list page number *num* and append its results to ``move_list``.

    Each extracted record is printed and written as one ``str(dict)``
    line. A list page without any movie link is skipped.
    """
    url = BASE_URL + "/html/gndy/dyzz/list_23_%s.html" % num
    detail_url = parsePage(getPage(url))
    if detail_url is None:
        # Nothing to follow on this page; previously this crashed in urlopen.
        return
    detail_html = getPage(detail_url)
    # ``with`` guarantees the file is flushed and closed on every call.
    with open("move_list", "a", encoding="utf8") as f:
        for obj in parsePage1(detail_html):
            print(obj)
            f.write(str(obj) + "\n")


if __name__ == "__main__":
    # Crawl list pages 1..180 only when run as a script, not on import.
    for i in range(1, 181):
        main(i)
# 本站文章如无特殊说明，均为本站原创；如若转载，请注明出处：爬虫初识（爬取dytt电影列表及下载地址） - Python技术站