"""Crawl a page that sits behind a login form protected by a CSRF token.

A user name and password are required first.  The script:

1. fetches the login page with a cookie-aware opener,
2. extracts the hidden ``csrfmiddlewaretoken`` form field with lxml,
3. posts the credentials together with that token through the same opener
   and prints the response body.
"""
import http.cookiejar
import urllib.parse
import urllib.request

from lxml import etree

head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
}

# Timeout in seconds applied to every request (value kept from the original
# script, which only applied it to the first request).
REQUEST_TIMEOUT = 1000


def makeMyOpener(head):
    """Build a urllib opener that stores cookies and sends custom headers.

    Args:
        head: Mapping of HTTP header names to values; every pair is attached
            to each request made through the returned opener.

    Returns:
        An ``OpenerDirector`` backed by a fresh in-memory ``CookieJar``.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # ``addheaders`` expects a list of (name, value) tuples.
    opener.addheaders = list(head.items())
    return opener


# Fetch our own login page.  The opener is kept and reused below so that any
# cookies received here are sent back with the login request.
oper = makeMyOpener(head)
with oper.open('http://127.0.0.1:8000/index/loginHtml/', timeout=REQUEST_TIMEOUT) as uop:
    data = uop.read()
html = data.decode()

# Extract csrfmiddlewaretoken from the login form with lxml.
selector = etree.HTML(html)
links = selector.xpath('//form/input[@name="csrfmiddlewaretoken"]/@value')
if not links:
    # Fail with a clear message instead of a NameError further down.
    raise RuntimeError('csrfmiddlewaretoken not found in the login page')
for link in links:
    print(link)
# When several tokens are present the last one wins, as in the original loop.
csrfmiddlewaretoken = links[-1]

url = 'http://127.0.0.1:8000/index/login/'
datas = {'csrfmiddlewaretoken': csrfmiddlewaretoken, 'email': 'aa', 'pwd': 'aa'}

# The form data must be bytes, not str; passing ``data`` makes this a POST.
data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
with oper.open(url, data_encoded, timeout=REQUEST_TIMEOUT) as response:
    content = response.read()
html = content.decode()
print(html)
# 本站文章如无特殊说明，均为本站原创，如若转载，请注明出处：爬虫day 04(通过登录去爬虫 解决django的csrf_token) - Python技术站