# -*- coding: utf-8 -*-
"""
Created on Sat Dec 18 00:00:59 2021
@author: Hider
"""
import requests
import parsel
import time
import pandas as pd
# HTTP headers sent with every request: a desktop Chrome user-agent string,
# and 'Connection: close' to ask that the connection be closed after each
# response.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.69 Safari/537.36'
    ),
    'Connection': 'close',
}
def get_page(page, timeout=10):
    """Download one listing page of free proxies and pass it to parse_page.

    Args:
        page: Page number; it is converted to text and appended to the
            listing URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests.get may block indefinitely when the server
            stops responding.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(page)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Wrap the response body in a Selector so parse_page can run XPath on it.
    html = parsel.Selector(response.text)
    parse_page(html)
def parse_page(html, rows=None):
    """Extract one record per proxy-table row and append it to a list.

    Each record is a seven-item list holding the first text node of the
    cell for every column title, in this order: IP, PORT, 匿名度, 类型,
    位置, 响应速度, 最后验证时间. A cell that yields no text contributes
    None.

    Args:
        html: Object whose xpath() method returns the table rows (the
            caller passes a parsel.Selector).
        rows: List that receives the records. When omitted, the
            module-level ``parse_lists`` is used so existing callers keep
            working.

    Returns:
        The list that received the records.

    Raises:
        NameError: If ``rows`` is omitted and no module-level
            ``parse_lists`` exists.
    """
    if rows is None:
        # Legacy behaviour: accumulate into the script's global list.
        rows = parse_lists

    # The data-title attribute of each <td> identifies its column.
    column_titles = ('IP', 'PORT', '匿名度', '类型', '位置', '响应速度', '最后验证时间')

    table_rows = html.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr'
    )
    for tr in table_rows:
        rows.append([
            tr.xpath('./td[@data-title="{}"]//text()'.format(title)).extract_first()
            for title in column_titles
        ])
        # Short pause per row; this also spaces out successive page requests.
        time.sleep(0.1)
    return rows
if __name__ == '__main__':
    # Module-level accumulator: parse_page() appends one record per table row.
    parse_lists = []

    # Fetch listing pages 1 through 20 inclusive.
    first_page, last_page = 1, 20
    for page in range(first_page, last_page + 1):
        get_page(page)

    # Column labels follow the order of the fields in each appended record.
    df = pd.DataFrame(
        parse_lists,
        columns=['IP', 'PORT', '匿名度', '类型', '位置', '响应速度', '最后验证时间'],
    )
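# Source: "爬虫学习笔记:打造自己的代理池" (Crawler study notes: building your own proxy pool) - Python技术站. Unless otherwise noted, articles on that site are original; please credit the source when reposting.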