""" 请求连接:https://maoyan.com/board/4 第二页:https://maoyan.com/board/4?offset=10 """ import requests import re class myspider(): def __init__(self,base_url,headers): self.base_url = base_url self.headers = headers #获取第一页数据 def get_data(self,start_num): url = self.base_url.format(start_num) response = requests.get(url = url,headers = self.headers) #判断状态码 if response.status_code == 200: return response.content.decode('utf8') else: return None #解析数据 def parse_onepage(self,html): pattern = re.compile('<dd>.*?board-index.*?>(\d+).*?movie-item-info.*?>.*?<a.*?title="(.*?)".*?>.*?</dd>',re.S) result = re.findall(pattern,html) return result #保存数据 def save_data(self,data): for value in data: list1 = [] for valuedate in value: list1.append(valuedate) #列表拼接成字符串 movestr = " ".join(list1)+'\n' with open('./movestr.txt','a',encoding='utf-8') as f: f.write(movestr) if __name__ == "__main__": #连接参数 base_url = "https://maoyan.com/board/4?offset={}" #请求头 headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" } my_spider = myspider(base_url, headers) html = my_spider.get_data(0) value = my_spider.parse_onepage(html) my_spider.save_data(value)
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:python简单爬虫 - Python技术站