# Tencent Video data scraper source code [notes] (腾讯视频信息数据爬取程序代码【笔记】)

  

# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentItem,CommentItem
import re,requests,json
 
 
class TencentSpiderSpider(scrapy.Spider):
    """Crawl the v.qq.com movie list pages, yielding movie and comment items.

    Flow:
      1. ``parse`` reads the filter links on the start page and requests each
         filtered list page.
      2. ``detail_parse`` handles one list page: for every movie it fetches
         the comments (yielding ``CommentItem``) and requests the movie's own
         page, then follows the "next page" link.
      3. ``movie_parse`` reads the movie page and yields a ``TencentItem``.
    """

    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    # hrefs taken from the list page are appended to this prefix.
    LIST_URL_TEMPLATE = 'https://v.qq.com/x/list/movie{}'
    # Endpoint whose response contains the comment_id for a given movie cid.
    COMMENT_ID_URL_TEMPLATE = ('https://ncgi.video.qq.com/fcgi-bin/video_comment_id'
                               '?otype=json&callback=jQuery&op=3&cid={}')
    # Endpoint returning the comments (JSON) for a given comment_id.
    COMMENT_URL_TEMPLATE = 'http://coral.qq.com/article/{}/comment/'
    USER_LINK_TEMPLATE = 'http://video.coral.qq.com/review/user/{}'
    COMMENT_ID_PATTERN = re.compile(r'comment_id":"(.*?)"')
    COMMENT_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 '
                                     'Firefox/53.0'}
    # Seconds before a blocking comment request is abandoned; without a
    # timeout a stalled connection would hang the whole crawl.
    REQUEST_TIMEOUT = 10

    def parse(self, response):
        """Request every filtered movie list linked from the start page."""
        filter_hrefs = response.xpath('//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in filter_hrefs:
            if self._is_javascript_href(href):
                # Pseudo-links cannot be turned into a list URL.
                continue
            yield scrapy.Request(url=self.LIST_URL_TEMPLATE.format(href),
                                 callback=self.detail_parse)

    def detail_parse(self, response):
        """Process one list page.

        Yields, for each movie on the page, its ``CommentItem`` objects and a
        request for the movie page (handled by ``movie_parse``, which receives
        the list-page fields through ``meta``). Finally yields a request for
        the next list page when one is linked.
        """
        # Per-movie fields of the (filtered) list page, read as parallel lists.
        # NOTE(review): the lists are matched by position only; if a movie
        # lacks a title/score/play count the later entries shift. Missing
        # trailing entries fall back to '' instead of raising IndexError.
        movie_links = response.xpath('//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles = response.xpath('//div[@class="figure_title_score"]/strong/a/text()').extract()
        score_texts = response.xpath('//div[@class="figure_score"]//text()').extract()
        # Drop whitespace-only text nodes, then glue consecutive pairs: each
        # score is assembled from two adjacent text nodes.
        score_parts = [text for text in score_texts if text.strip()]
        total_score = [score_parts[j] + score_parts[j + 1]
                       for j in range(0, len(score_parts) - 1, 2)]
        movie_playCounts = response.xpath('//div[@class="figure_count"]/span/text()').extract()  # play counts
        # Count text shown on the page (presumably the number of results).
        movie_account = response.xpath('//span[@class="option_txt"]/em/text()').extract_first('')

        for index, raw_link in enumerate(movie_links):
            title = self._item_at(movie_titles, index)
            # The cid is the file name of the movie link without its
            # extension, e.g. ".../b5i4g9z3u5h31jy.html" -> "b5i4g9z3u5h31jy".
            # It is used to look up the comment_id, which in turn addresses
            # the comment JSON.
            cid = raw_link.split('/')[-1].split('.')[0]
            for comment in self._fetch_comments(cid, title):
                yield comment

            # urljoin leaves absolute URLs untouched and repairs relative or
            # scheme-less ones, which scrapy.Request would reject.
            movie_link = response.urljoin(raw_link)
            yield scrapy.Request(url=movie_link,
                                 callback=self.movie_parse,
                                 meta={'movie_link': movie_link,
                                       'movie_title': title,
                                       'score': self._item_at(total_score, index),
                                       'movie_playCount': self._item_at(movie_playCounts, index),
                                       'movie_account': movie_account})

        # Next page
        next_pg = response.xpath('//a[@class="page_next"]/@href').extract_first('')
        self.logger.debug('next page href: %r', next_pg)
        if next_pg and not self._is_javascript_href(next_pg):
            yield scrapy.Request(url=self.LIST_URL_TEMPLATE.format(next_pg),
                                 callback=self.detail_parse)

    def movie_parse(self, response):
        """Build the ``TencentItem`` for one movie page.

        Introduction fields are read from the page; overview fields come from
        ``response.meta`` as filled in by ``detail_parse``. Missing
        director/cast information is stored as ``'#'``.
        """
        # Introduction section
        abstract = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text()').extract_first('')
        people = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()').extract()
        people_links = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href').extract()
        if people:
            # First entry is treated as the director, the rest as the cast.
            director = people[0]
            act = ','.join(people[1:])
            # Names can be present without links; do not index an empty list.
            director_link = people_links[0] if people_links else '#'
            act_link = ','.join(people_links[1:])
        else:
            director = '#'
            act = '#'
            director_link = '#'
            act_link = '#'

        movie = TencentItem()
        # Introduction
        movie['abstract'] = abstract
        movie['director'] = director
        movie['act'] = act
        movie['director_link'] = director_link
        movie['act_link'] = act_link
        # Overview (collected on the list page)
        movie['movie_title'] = response.meta['movie_title']
        movie['score'] = response.meta['score']
        movie['movie_playCount'] = response.meta['movie_playCount']
        movie['movie_link'] = response.meta['movie_link']
        movie['movie_account'] = response.meta['movie_account']
        yield movie

    @staticmethod
    def _is_javascript_href(href):
        """Return True when ``href`` is a ``javascript:`` pseudo-link."""
        return href.strip().lower().startswith('javascript:')

    @staticmethod
    def _item_at(values, index, default=''):
        """Return ``values[index]``, or ``default`` when the list is too short."""
        return values[index] if index < len(values) else default

    def _fetch_comment_id(self, cid):
        """Return the comment_id for movie ``cid``, or None when unavailable."""
        url = self.COMMENT_ID_URL_TEMPLATE.format(cid)
        try:
            body = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            self.logger.warning('comment id request failed for cid %s: %s', cid, exc)
            return None
        match = self.COMMENT_ID_PATTERN.search(body)
        if match is None:
            self.logger.warning('no comment_id in response for cid %s', cid)
            return None
        return match.group(1)

    def _fetch_comments(self, cid, movie_title):
        """Yield a ``CommentItem`` for every comment of the movie ``cid``.

        Any network, JSON or schema problem is logged and ends the comment
        collection for this movie only, so the rest of the list page is
        still processed.
        """
        # NOTE: these are blocking `requests` calls made from inside a Scrapy
        # callback; they stall the crawler while they run. Kept from the
        # original design, but bounded by REQUEST_TIMEOUT.
        comment_id = self._fetch_comment_id(cid)
        if not comment_id:
            return
        comment_url = self.COMMENT_URL_TEMPLATE.format(comment_id)
        try:
            comment_html = requests.get(comment_url,
                                        headers=self.COMMENT_HEADERS,
                                        timeout=self.REQUEST_TIMEOUT).text
            payload = json.loads(comment_html)
        except (requests.RequestException, ValueError) as exc:
            self.logger.warning('comment request failed for %s: %s', comment_url, exc)
            return

        data = payload.get('data') if isinstance(payload, dict) else None
        comment_list = data.get('commentid') if isinstance(data, dict) else None
        for detail in comment_list or []:  # empty/None: the movie has no comments
            try:
                comment = self._build_comment(movie_title, detail)
            except (KeyError, TypeError) as exc:
                self.logger.warning('skipping malformed comment for cid %s: %r', cid, exc)
                continue
            yield comment

    def _build_comment(self, movie_title, detail):
        """Convert one comment dict from the comment JSON into a ``CommentItem``."""
        comment = CommentItem()
        comment['movie_title'] = movie_title  # movie name
        comment['timeDifference'] = detail['timeDifference']  # publish time
        comment['content'] = detail['content']  # comment text
        comment['up'] = detail['up']  # likes
        # 'rep' as returned by the API (the original note calls it
        # down-votes -- unverified).
        comment['rep'] = detail['rep']
        userid = detail['userinfo']['userid']  # 'userinfo' is a nested dict
        comment['userid'] = userid  # user id
        comment['userLink'] = self.USER_LINK_TEMPLATE.format(userid)  # user page link
        return comment