node爬虫代理设置
最近想爬取YouTube上面的视频信息,利用nodejs爬虫笔记(一)的方法,代码和错误如下
var request = require('request');
var cheerio = require('cheerio');

var url = 'https://www.youtube.com';

/**
 * Fetch a page and print the concatenated text of every element in it.
 *
 * @param {string} url - Address of the page to download.
 * @param {function(Error|null, Array=)} callback - Called exactly once:
 *   with the request error on failure, otherwise with `null` and the
 *   result list (still empty here; nothing is extracted into it yet).
 */
function crawler(url, callback) {
  var list = [];
  request(url, function (err, res) {
    if (err) {
      return callback(err);
    }
    // cheerio.load() returns the `$` selector function for the document.
    var $ = cheerio.load(res.body.toString());
    var s = $('*').text();
    console.log('s=' + s);
    // Report back only after the response has been handled, so the
    // callback never fires before the request finishes or fires twice.
    callback(null, list);
  });
}

crawler(url, function (err, list) {
  if (err) {
    return console.log(err);
  }
  console.log(list);
});
错误
{ [Error: connect ETIMEDOUT 8.7.198.45:443 ] code: ‘ETIMEDOUT’, errno: ‘ETIMEDOUT’, syscall: ‘connect’, address: '8.7.198.45', port: 443 }
由于国内访问youtube的时候需要FQ,而在代码里我们需要通过设置代理才能获取页面信息。
1、通过nodejs的http/https模块
具体使用可以参考http://nodejs.cn/api/http.html#http_http_request_options_callback,我使用lantern作为FQ工具。
var http = require('http'); // uses the http module; the https module can be substituted
var zlib = require('zlib');

var opt = {
  host: '127.0.0.1', // address of the proxy server
  port: '57939', // port of the proxy server
  method: 'GET', // request method
  path: 'https://www.youtube.com', // target URL, sent to the proxy as an absolute URL
  headers: {
    // Request headers: open the YouTube home page in Chrome, inspect the
    // request in the Network panel and copy the relevant values over.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    // Only advertise the encodings decodeBody() below can undo.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
    'Cache-Control': 'max-age=0',
    // Replace with the cookie of your own browser session.
    'Cookie': '_ga=GA1.2.1653214693.1476773935; VISITOR_INFO1_LIVE=T3BczuPUIQo; SID=5QR6XEldVgveXzFtqjIcD480cHE18gBRd3xPo398vndcc5JNxOAZ-TgVp5jQx3CR-ePvgA.; HSID=APr2I8UwM-A-Lypbd; SSID=Ap4H3Td1nrV__-9tN; APISID=8bHyFV90pNBU5Z9p/A2DlJa2MyJLL4-RKP; SAPISID=4tZf4GDX7Dt5bNAt/A5vhaZe_DLzn-ECul; CONSENT=YES+CN.zh-CN+20160904-14-0; YSC=XVHk_pArWhE; PREF=cvdm=grid&f1=50000000&f6=1&f5=30&al=zh-CN&gl=HK',
    'Upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
    'X-Chrome-Uma-Enabled': '1',
    'X-Client-Data': 'CJa2yQEIorbJAQjBtskBCKmdygE=',
    'Connection': 'keep-alive'
  }
};

/**
 * Undo the Content-Encoding applied to a response body.
 *
 * @param {Buffer} raw - Complete body exactly as received.
 * @param {string|undefined} encoding - Value of the Content-Encoding header.
 * @param {function(Error|null, Buffer=)} done - Receives the decoded bytes;
 *   any other (or missing) encoding passes `raw` through unchanged.
 */
function decodeBody(raw, encoding, done) {
  if (encoding === 'gzip') {
    return zlib.gunzip(raw, done);
  }
  if (encoding === 'deflate') {
    return zlib.inflate(raw, done);
  }
  done(null, raw);
}

var body = '';
var req = http.request(opt, function (res) {
  console.log("Got response: " + res.statusCode);
  // Keep the chunks as Buffers: appending them to a string would corrupt
  // compressed data and could split multi-byte UTF-8 characters.
  var chunks = [];
  res.on('data', function (d) {
    chunks.push(d);
  }).on('end', function () {
    decodeBody(Buffer.concat(chunks), res.headers['content-encoding'], function (err, decoded) {
      if (err) {
        return console.log("Got error: " + err.message);
      }
      body = decoded.toString('utf8');
      console.info('============');
      console.log(body);
    });
  });
}).on('error', function (e) {
  console.log("Got error: " + e.message);
});
req.end();
2、使用SuperAgent以及superagent-proxy模块
为了使用方便以及加快开发的速度,我们通常会引入第三方模块。SuperAgent也是一个封装好的http模块,功能和Request模块差不多。如果要使用代理的话,还需要额外的拓展模块SuperAgent-Proxy。
SuperAgent官网地址
SuperAgent-proxy官网地址
var request = require('superagent');
require('superagent-proxy')(request); // extends superagent requests with .proxy()
var fs = require('fs');

var proxy = 'http://127.0.0.1:57939';

// Request headers copied from a browser session; replace the cookie with
// the one from your own session.
var header = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  // Now that the headers are really sent, only ask for encodings the
  // client is expected to decode.
  'Accept-Encoding': 'gzip, deflate',
  'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
  'Cache-Control': 'max-age=0',
  'Cookie': '_ga=GA1.2.1653214693.1476773935; VISITOR_INFO1_LIVE=T3BczuPUIQo; SID=5QR6XEldVgveXzFtqjIcD480cHE18gBRd3xPo398vndcc5JNxOAZ-TgVp5jQx3CR-ePvgA.; HSID=APr2I8UwM-A-Lypbd; SSID=Ap4H3Td1nrV__-9tN; APISID=8bHyFV90pNBU5Z9p/A2DlJa2MyJLL4-RKP; SAPISID=4tZf4GDX7Dt5bNAt/A5vhaZe_DLzn-ECul; CONSENT=YES+CN.zh-CN+20160904-14-0; YSC=XVHk_pArWhE; PREF=cvdm=grid&f1=50000000&f6=1&f5=30&al=zh-CN&gl=HK',
  'Upgrade-insecure-requests': '1',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
  'X-Chrome-Uma-Enabled': '1',
  'X-Client-Data': 'CJa2yQEIorbJAQjBtskBCKmdygE=',
  'Connection': 'keep-alive'
};

request
  .get('https://www.youtube.com')
  // Pass the object itself: .set('header', header) would send a single
  // field literally named "header" instead of the fields above.
  .set(header)
  .proxy(proxy)
  .end(onresponse);

/**
 * Handle the finished request: log the status and body, then save the
 * body to data/home.json next to this script.
 *
 * @param {Error|null} err - Set when the request failed.
 * @param {Object} res - superagent response; may be undefined when the
 *   request never produced a response, so it is only read after the
 *   error check.
 */
function onresponse(err, res) {
  if (err) {
    console.log(err);
    return;
  }
  console.log('status:' + res.status);
  console.log(res.text);
  // Persist res.text to a JSON file.
  fs.writeFile(__dirname + '/data/home.json', JSON.stringify({
    status: 0,
    data: res.text
  }), function (err) {
    if (err) {
      return console.log(err);
    }
    console.log('完成');
  });
}
运行后会出现以下结果:
status:200
-sessionlink="itct=CJUBEJQ1GAEiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" title="欅って、書けない 2017年7月9日 #87" aria-describedby="description-id-25113" dir="ltr">欅って、書けない 2017年7月9日 #87</a><span class="accessible-description" >> <img width="196" alt="" data-ytimg="1" src="https://i.ytimg.com/vi/6OwlCt4aKfM/hqdefault.jpg?sqp=-oaymwEWCMQBEG5IWvKriqkDCQgBFQAAiEIYAQ==&rs=AOn4CLAwVQhAiFdPT0nRTMFB8rX989yXuA" height="110" onload=";window.__ytRIL && __ytRIL(this)" > <span class="video-time" aria-hidden="true">6:44</span></span></div></a> <span class="thumb-menu dark-overflow-action-menu video-actions"> <button aria-haspopup="true" aria-expanded="false" onclick=";return false;" class="yt-uix-button-reverse flip addto-watch-queue-menu spf-nolink hide-until-delayloaded yt-uix-button yt-uix-button-dark-overflow-action-menu yt-uix-button-size-default yt-uix-button-has-icon no-icon-markup yt-uix-button-empty" type="button" ><span class="yt-uix-button-arrow yt-sprite"></span><ul class="watch-queue-thumb-menu yt-uix-button-menu yt-uix-button-menu-dark-overflow-action-menu hid"><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-next yt-uix-button-menu-item" data-action="play-next" onclick=";return false;" data-video-ids="6OwlCt4aKfM"><span class="addto-watch-queue-menu-text">次に再生</span></li><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-now yt-uix-button-menu-item" data-action="play-now" onclick=";return false;" data-video-ids="6OwlCt4aKfM"><span class="addto-watch-queue-menu-text">今すぐ再生</span></li></ul></button> </span> <button class="yt-uix-button yt-uix-button-size-small yt-uix-button-default yt-uix-button-empty yt-uix-button-has-icon no-icon-markup addto-button video-actions spf-nolink hide-until-delayloaded addto-watch-later-button-sign-in yt-uix-tooltip" type="button" onclick=";return false;" title="後で見る" role="button" data-button-menu-></span></button> 
<button class="yt-uix-button yt-uix-button-size-small yt-uix-button-default yt-uix-button-empty yt-uix-button-has-icon no-icon-markup addto-button addto-queue-button video-actions spf-nolink hide-until-delayloaded addto-tv-queue-button yt-uix-tooltip" type="button" onclick=";return false;" title="キュー" data-style="tv-queue" data-video-ids="6OwlCt4aKfM"></button> </div><div class="yt-lockup-content"><h3 class="yt-lockup-title "><a href="/watch?v=6OwlCt4aKfM" class=" yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link " data-sessionlink="itct=CJQBEJQ1GAIiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" title="シャンプー中に血のり垂らしたらとんでもないパニックになったwww" aria-describedby="description-id-467161" dir="ltr">シャンプー中に血のり垂らしたらとんでもないパニックになったwww</a><span class="accessible-description" > <img width="196" alt="" data-ytimg="1" src="https://i.ytimg.com/vi/t-saIu9AD58/hqdefault.jpg?sqp=-oaymwEWCMQBEG5IWvKriqkDCQgBFQAAiEIYAQ==&rs=AOn4CLCICoGcnfWiQ4rKmXIbmq1fspGKiA" height="110" onload=";window.__ytRIL && __ytRIL(this)" > <span class="video-time" aria-hidden="true">8:40</span></span></div></a> <span class="thumb-menu dark-overflow-action-menu video-actions"> <button aria-haspopup="true" aria-expanded="false" onclick=";return false;" class="yt-uix-button-reverse flip addto-watch-queue-menu spf-nolink hide-until-delayloaded yt-uix-button yt-uix-button-dark-overflow-action-menu yt-uix-button-size-default yt-uix-button-has-icon no-icon-markup yt-uix-button-empty" type="button" ><span class="yt-uix-button-arrow yt-sprite"></span><ul class="watch-queue-thumb-menu yt-uix-button-menu yt-uix-button-menu-dark-overflow-action-menu hid"><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-next yt-uix-button-menu-item" data-action="play-next" onclick=";return false;" data-video-ids="t-saIu9AD58"><span class="addto-watch-queue-menu-text">次に再生</span></li><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice 
addto-watch-queue-play-now yt-uix-button-menu-item" data-action="play-now" onclick=";return false;" data-video-ids="t-saIu9AD58"><span class="addto-watch-queue-menu-text">今すぐ再生</span></li></ul></button> </span> .......太长了就省略了
此时我们已经可以成功连接上YouTube,接下来就可以利用cheerio模块对其进行解析啦。
请求头的获取:
代码中的请求头可以利用浏览器,FQ后打开YouTube首页,点击检查,然后点击network,刷新后我们会发现header里面包含的请求信息。
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:nodejs爬虫笔记(二)—代理设置 - Python技术站