Python crawler: enter a title and fetch the matching Baidu Jingyan content
The original requirement: given a title, fetch the matching content, translate it to English and then back to Chinese to raise the text's "originality".
import requests
import re
import json
import urllib.request
import urllib.parse
from lxml import etree

def first_url(url):
    """Fetch a page and return its HTML decoded as UTF-8."""
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    # pass headers as a keyword argument; a positional dict is treated
    # as query parameters by requests.get, not as HTTP headers
    res = requests.get(url, headers=send_headers)
    res.encoding = 'utf-8'
    html = res.text
    res.close()
    return html
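If the fetch helper is reused beyond a one-off experiment, it helps to add a timeout and a status check. The variant below is a minimal sketch with those two additions; the helper name and the timeout value are my own choices, not part of the original script:

def first_url_safe(url, timeout=10):
    """Like first_url, but fails fast on network stalls and HTTP errors."""
    send_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
    res = requests.get(url, headers=send_headers, timeout=timeout)
    res.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    res.encoding = 'utf-8'
    return res.text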
# Extract the text of the <p> tags in the first Baidu Jingyan article
# and write it to f.txt.
def get_result(html):
    title = []
    datahtml = etree.HTML(html)
    html_data = datahtml.xpath('/html/body/section/div/div/article/div/div/div/ol/li/div/p')
    for i in html_data:
        # i.text is None for <p> tags without direct text; skip those,
        # otherwise ''.join() raises a TypeError
        if i.text:
            title.append(i.text)
    finalre = ''.join(title)
    with open('f.txt', 'w', encoding='utf-8') as f:
        f.write(finalre)
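The absolute XPath above is tied to one exact page layout and silently returns nothing if Baidu changes the markup. As a fallback, a looser query can collect every <p> text node on the page; this is a generic sketch, not the page's confirmed structure, and it may also pick up navigation or footer text:

def get_result_loose(html):
    """Fallback extractor: join all <p> text nodes on the page."""
    datahtml = etree.HTML(html)
    paragraphs = datahtml.xpath('//p//text()')
    return ''.join(p.strip() for p in paragraphs)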
# Translation helper: the Youdao web endpoint with from/to set to AUTO,
# so the same function handles both zh->en and en->zh.
def youdao_translate(content):
    '''Call the Youdao web translation interface.'''
    youdao_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {
        'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        # salt and sign come from a captured browser request; the
        # endpoint may reject them if its signing scheme changes
        'salt': '1525141473246',
        'sign': '47ee728a4465ef98ac06510bf67f3023',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typoResult': 'false'
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    youdao_response = urllib.request.urlopen(youdao_url, data)
    youdao_html = youdao_response.read().decode('utf-8')
    target = json.loads(youdao_html)
    trans = target['translateResult']
    ret = ''
    for i in range(len(trans)):
        line = ''
        for j in range(len(trans[i])):
            # concatenate every translated segment of this result row
            line += trans[i][j]['tgt']
        ret += line + '\n'
    return ret
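Because both from and to are AUTO, raising "originality" is just calling the function twice. A quick check of the round trip (the example sentence is arbitrary, and the hard-coded salt/sign may stop working if Youdao changes its API):

en = youdao_translate('为什么会流鼻血')  # Chinese -> English
zh = youdao_translate(en)                # English -> back to Chinese
print(en)
print(zh)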
if __name__ == "__main__":
    search_value = '为什么流鼻血'
    url = "http://jingyan.baidu.com/search?word=%s" % search_value
    first_html_url = first_url(url)  # fetch the search-result page
    # take the first Baidu Jingyan article link; the capture group
    # strips the surrounding quotes, so no eval() is needed
    second_url = re.findall(r'"(/article/.*?\.html)"', first_html_url)[0]
    second_resul_url = "http://jingyan.baidu.com%s" % second_url
    se_html_resu = first_url(second_resul_url)  # fetch the first article
    get_result(se_html_resu)  # write the article's <p> text to f.txt
    with open('f.txt', 'r', encoding='utf-8') as fif:
        linelist = fif.read().split('。')
    for line in linelist:
        if line.find('?') > 0:
            # sentences containing question marks are split again and
            # each piece is round-tripped separately
            linli = line.split('?')
            for l in linli:
                if len(l) > 0:
                    fi = youdao_translate(l)     # Chinese -> English
                    zhfi = youdao_translate(fi)  # English -> Chinese
                    print(zhfi)
        elif len(line) > 0:
            print(line)
            fi = youdao_translate(line)
            zhfi = youdao_translate(fi)
            print(zhfi)
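The Chinese-to-English-to-Chinese round trip appears twice in the loop above; as a possible cleanup (a suggestion, not part of the original script), it can be factored into one helper:

def back_translate(text):
    """Round-trip text through English to paraphrase it."""
    english = youdao_translate(text)
    return youdao_translate(english)

# inside the loop, the two translate calls then collapse to:
#     print(back_translate(line))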