腾讯视频信息数据爬取程序代码【笔记】
# -*- coding: utf-8 -*-
"""Scrapy spider that collects movie listings and comments from v.qq.com."""
import json
import re

import requests
import scrapy

from ..items import TencentItem, CommentItem


class TencentSpiderSpider(scrapy.Spider):
    """Crawl Tencent Video movie listings, their detail pages and comments.

    Flow:
      1. ``parse`` reads the category filter links on the movie list page.
      2. ``detail_parse`` reads one listing page, yields a ``CommentItem`` per
         comment of every listed movie, schedules each movie's detail page
         and follows the "next page" link.
      3. ``movie_parse`` reads the detail page and yields a ``TencentItem``.

    NOTE(review): comments are fetched with the blocking ``requests`` library
    from inside a Scrapy callback. Consider converting those two lookups into
    chained ``scrapy.Request`` objects.
    """

    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    # Header sent with the comment-page request (the only request that used
    # it in the original code).
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) '
                      'Gecko/20100101 Firefox/53.0',
    }
    # Seconds before a blocking comment lookup is abandoned, so that one
    # unresponsive endpoint cannot hang the crawl indefinitely.
    _REQUEST_TIMEOUT = 10
    # Extracts the comment_id value from the JSONP comment-id response.
    _COMMENT_ID_RE = re.compile(r'comment_id":"(.*?)"')

    def parse(self, response):
        """Schedule one listing request per category filter link."""
        category_part = response.xpath(
            '//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in category_part:
            detail_url = 'https://v.qq.com/x/list/movie{}'.format(href)
            yield scrapy.Request(url=detail_url, callback=self.detail_parse)

    def detail_parse(self, response):
        """Process one listing page.

        Yields, for every movie on the page, its ``CommentItem`` objects
        followed by a request for the movie's detail page (listing data is
        forwarded through ``meta``), and finally a request for the next
        listing page when one exists.
        """
        # Listing data for the selected category; the lists are parallel.
        movie_links = response.xpath(
            '//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles = response.xpath(
            '//div[@class="figure_title_score"]/strong/a/text()').extract()
        movie_scores = response.xpath(
            '//div[@class="figure_score"]//text()').extract()
        total_score = self._pair_scores(movie_scores)
        # Play counts.
        movie_playCounts = response.xpath(
            '//div[@class="figure_count"]/span/text()').extract()
        # Number of movies reported by the page.
        movie_account = response.xpath(
            '//span[@class="option_txt"]/em/text()').extract_first('')

        for x, movie_link in enumerate(movie_links):
            movie_title = self._item_at(movie_titles, x)

            # The cid is the file name of the movie link without extension,
            # e.g. ".../b5i4g9z3u5h31jy.html" -> "b5i4g9z3u5h31jy". It is used
            # to look up the comment_id, which addresses the comment page.
            cid = movie_link.split('/')[-1].split('.')[0]

            for detail in self._fetch_comments(cid):
                comment = self._build_comment(movie_title, detail)
                if comment is not None:
                    yield comment

            # Visit the movie detail page. A failed comment lookup must not
            # prevent this request from being scheduled.
            yield scrapy.Request(
                url=movie_link,
                callback=self.movie_parse,
                meta={
                    'movie_link': movie_link,
                    'movie_title': movie_title,
                    'score': self._item_at(total_score, x),
                    'movie_playCount': self._item_at(movie_playCounts, x),
                    'movie_account': movie_account,
                },
            )

        # Next page.
        next_pg = response.xpath(
            '//a[@class="page_next"]/@href').extract_first('')
        self.logger.debug('next listing page: %r', next_pg)
        if next_pg:
            next_url = 'https://v.qq.com/x/list/movie{}'.format(next_pg)
            yield scrapy.Request(url=next_url, callback=self.detail_parse)

    def movie_parse(self, response):
        """Build a ``TencentItem`` from a detail page plus the listing meta.

        The first linked name in the credits block is stored as the director
        and the remaining ones, comma-joined, as the actors; ``'#'`` is the
        placeholder when the information is missing.
        """
        # Synopsis area.
        abstract = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text()'
        ).extract_first('')
        directors = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()'
        ).extract()
        director_links = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href'
        ).extract()

        if directors:  # credit names are present
            director = directors[0]
            act = ','.join(directors[1:])
        else:
            director = '#'
            act = '#'

        # Links are checked separately: names may exist without any href.
        if directors and director_links:
            director_link = director_links[0]
            act_link = ','.join(director_links[1:])
        else:
            director_link = '#'
            act_link = '#'

        movie = TencentItem()
        # Synopsis fields.
        movie['abstract'] = abstract
        movie['director'] = director
        movie['act'] = act
        movie['director_link'] = director_link
        movie['act_link'] = act_link
        # Overview fields forwarded from the listing page.
        movie['movie_title'] = response.meta['movie_title']
        movie['score'] = response.meta['score']
        movie['movie_playCount'] = response.meta['movie_playCount']
        movie['movie_link'] = response.meta['movie_link']
        movie['movie_account'] = response.meta['movie_account']
        yield movie

    @staticmethod
    def _pair_scores(raw_scores):
        """Join consecutive score text nodes pairwise into score strings.

        Whitespace-only nodes are dropped first; a trailing unpaired node is
        ignored, as in the original pairing loop.
        """
        parts = [text for text in raw_scores if text.strip()]
        return [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]

    @staticmethod
    def _item_at(values, index, default=''):
        """Return ``values[index]``, or ``default`` when the list is shorter."""
        return values[index] if index < len(values) else default

    def _fetch_comments(self, cid):
        """Return the list of raw comment dicts for the movie with this cid.

        Any network, pattern or JSON failure is logged and reported as an
        empty list so the caller can continue with the remaining movies.
        """
        comment_id_url = ('https://ncgi.video.qq.com/fcgi-bin/video_comment_id'
                          '?otype=json&callback=jQuery&op=3&cid={}'.format(cid))
        try:
            html = requests.get(
                comment_id_url, timeout=self._REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            self.logger.warning(
                'comment id lookup failed for cid %s: %s', cid, exc)
            return []

        match = self._COMMENT_ID_RE.search(html)
        if match is None:
            self.logger.warning('no comment_id found for cid %s', cid)
            return []

        # Fetch the comment page; its body is JSON.
        comment_url = 'http://coral.qq.com/article/{}/comment/'.format(
            match.group(1))
        try:
            comment_html = requests.get(
                comment_url, headers=self._HEADERS,
                timeout=self._REQUEST_TIMEOUT).text
            payload = json.loads(comment_html)
        except (requests.RequestException, ValueError) as exc:
            self.logger.warning(
                'comment page fetch failed for cid %s: %s', cid, exc)
            return []

        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            return []
        # An absent or empty list means the movie has no comments.
        return data.get('commentid') or []

    def _build_comment(self, movie_title, detail):
        """Convert one raw comment dict into a ``CommentItem``.

        Returns ``None`` (after logging) when an expected key is missing.
        """
        try:
            userid = detail['userinfo']['userid']
            comment = CommentItem()
            comment['movie_title'] = movie_title  # movie name
            comment['timeDifference'] = detail['timeDifference']  # post time
            comment['content'] = detail['content']  # comment text
            comment['up'] = detail['up']  # likes
            comment['rep'] = detail['rep']  # "rep" counter (original: dislikes)
            comment['userid'] = userid  # user id
            comment['userLink'] = (
                'http://video.coral.qq.com/review/user/{}'.format(userid))
        except (KeyError, TypeError) as exc:
            self.logger.warning(
                'skipping malformed comment for %r: %r', movie_title, exc)
            return None
        return comment
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:腾讯视频信息数据爬虫开发【核心爬虫代码】 - Python技术站