一个关于豆瓣影评的爬虫,涉及:模拟登录、翻页抓取。直接上代码:
"""豆瓣影评爬虫 (Douban movie-review scraper).

Logs in with a requests Session (handling the optional captcha), then walks
the comment pages of one movie, writing one row per comment — date, star,
vote, content — into an xlsx workbook.
"""
import re
import time

import requests
import xlsxwriter
from bs4 import BeautifulSoup

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'),
    'Referer': 'https://www.douban.com/accounts/login?source=movie',
}

s = requests.Session()


def log_in(login_url):
    """POST the login form to *login_url*, prompting for a captcha only
    when the login page actually shows one.

    Side effects: saves the captcha image to ``douban.jpg`` when present,
    prints the response status code, and leaves the authenticated cookies
    on the module-level session ``s``.
    """
    page = s.get("https://www.douban.com/accounts/login?source=movie",
                 headers=headers, verify=False).text
    payload = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "form_email": "你的邮箱",
        "form_password": "你的密码",
        "login": "登录",
    }
    # The captcha is not always rendered; when absent, submit without it
    # instead of crashing on an empty findall() result.
    try:
        # NOTE(review): the original captcha regex was mangled in the post;
        # this pattern assumes Douban's standard captcha <img> markup —
        # confirm against the live login page.
        img_url = re.findall(r'<img id="captcha_image" src="(.*?)"', page)[0]
        with open("douban.jpg", 'wb') as f:
            f.write(s.get(img_url).content)
        capid = re.findall(
            r'<input type="hidden" name="captcha-id" value="(.*?)"/>',
            page)[0]
        payload["captcha-solution"] = input("输入验证码:")
        payload["captcha-id"] = capid
    except IndexError:
        # No captcha on this login attempt — proceed with the plain form.
        pass
    resp = s.post(login_url, data=payload, verify=False)  # 绕过了SSL验证
    print(resp.status_code)


i = 0  # running worksheet row index, shared across pages


def get_data(url):
    """Scrape comment pages starting at *url*, following the 后一页 link.

    Iterative rather than recursive, so very long comment lists cannot
    exhaust the call stack. Each comment is written inside its own
    try/except so one malformed item no longer aborts the rest of the
    page (the likely cause of the ~1/5 missing rows).
    """
    global i
    next_pat = re.compile(r'<a href="?(.*?)" .*? class="next">后一页</a>')
    date_pat = re.compile(r"\d\d\d\d-\d\d-\d\d")
    base = "https://movie.douban.com/subject/25958717/comments"
    while url:
        time.sleep(2)  # be polite: throttle requests
        print("#" * 50)
        print(i)
        try:
            data = s.get(url, headers=headers).text
        except requests.RequestException:
            time.sleep(3)
            print("正在尝试重新加载页面...")
            try:
                data = s.get(url, headers=headers).text
            except requests.RequestException:
                return  # give up; caller closes the workbook
        soup = BeautifulSoup(data, "lxml")
        for comment in soup.findAll("div", {"class": "comment-item"}):
            i += 1
            try:
                info = comment.find("span", {"class": "comment-info"})
                # date: first yyyy-mm-dd inside the unclassed <span>
                date_text = info.find("span", {"class": ""}).get_text()
                worksheet.write(i, 0, date_pat.findall(date_text)[0])
                # star: e.g. class="allstar40" -> "4"
                worksheet.write(i, 1, info.find("span")["class"][0][-2:-1])
                vote = comment.find("span", {"class": "comment-vote"}) \
                              .find("span").get_text()
                worksheet.write(i, 2, vote)
                content = comment.find("div", {"class": "comment"}) \
                                 .find("p").get_text()
                print(content)
                worksheet.write(i, 3, content)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Malformed comment item (deleted review, missing span,
                # no date, ...) — skip just this one.
                continue
        # Follow the "next page" link; stop cleanly when there is none.
        matches = next_pat.findall(data)
        if matches:
            url = base + str(matches[0]).replace("amp;", "")
            print("正在抓取" + url + "...")
        else:
            url = None


workbook = xlsxwriter.Workbook('海蒂和爷爷影评.xlsx')
worksheet = workbook.add_worksheet()
worksheet.set_column('A:A', 20)
worksheet.set_column('B:B', 10)
worksheet.set_column('C:C', 10)
worksheet.set_column('D:D', 500)

login_url = "https://accounts.douban.com/login"
log_in(login_url)
comment_data = get_data("https://movie.douban.com/subject/25958717/comments")
workbook.close()
这里有两个问题:
1.首先,登录的时候可能不需要验证码(此时自然也抓不到验证码图片),给验证码处理部分加上 try 即可。
2.数据抓取不全:总是剩下约 1/5 的数据抓不到,目前还未解决,请看到的大神指点!
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:python3爬虫再探之豆瓣影评数据抓取 - Python技术站