1. Configure the third-party packages
# Module for working with dates and times
import datetime
# Data-analysis module (used here only for pd.date_range)
import pandas as pd
# Module for building the .xlsx file
import xlsxwriter as xlw
# Module used to fetch the pages
from urllib import request
# Parser for the content of HTML/XML tags
from bs4 import BeautifulSoup as bs
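pandas, xlsxwriter, and beautifulsoup4 come from PyPI (pip install pandas xlsxwriter beautifulsoup4 if the imports fail); datetime and urllib ship with the standard library, so nothing else needs to be installed.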
2. The date-sequence function
# Generate the sequence of months between two dates
def dateRange1(start, end):
    # Format every day in the range as 'YYYYMM', then dedupe and sort
    datelist1 = [datetime.datetime.strftime(x, '%Y%m')
                 for x in pd.date_range(start=start, end=end)]
    datelist = sorted(set(datelist1))
    return datelist
    # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']
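A quick sanity check: calling the generator directly prints the month keys it produces (the expected output below is taken from the comment above):

print(dateRange1('2020-09', '2020-10'))
# ['202009', '202010']
print(dateRange1('2020-05', '2020-10'))
# ['202005', '202006', '202007', '202008', '202009', '202010']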
3. Scrape the pages
# Fetch the pages, parse the HTML, filter the data, and return it as a list of rows
def getCommentsById(city, start, end):
    weather_result = []
    # Get the month sequence, e.g. ['202009', '202010']
    datelist = dateRange1(start, end)
    for month in datelist:
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        # Build the request for the weather page
        opener = request.Request(url)
        # Add an HTTP User-Agent request header
        opener.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        req = request.urlopen(opener).read()
        # Parse the HTML
        soup = bs(req, 'html.parser')
        # Filter the daily rows with the CSS selector 'div .thrui > li'
        weather_m = soup.select('div .thrui > li')
        # Each row holds five <div>s: date, high, low, weather, wind direction
        for day in weather_m:
            tt = []
            for j in range(5):
                t = day.find_all('div')[j].string
                # Replace None values, otherwise they cannot be written to Excel
                if t is not None:
                    tt.append(t)
                else:
                    tt.append('None')
            weather_result.append(tt)
    print(weather_result)
    return weather_result
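The fetch above has no timeout or error handling, so one slow or missing month page aborts the whole run. Below is a minimal defensive sketch; the helper name fetch_html and the 10-second timeout are my own assumptions, not part of the original code:

from urllib import request, error

def fetch_html(url, timeout=10):
    # Hypothetical helper: return the page body, or None on any network failure
    req = request.Request(url)
    req.add_header(
        'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    try:
        # HTTPError subclasses URLError, so one except clause covers both
        with request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except error.URLError as e:
        print('failed to fetch %s: %s' % (url, e))
        return None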
4. Write the Excel file
# Write the list data to a local Excel file
def list_to_excel(weather_result, filename):
    # Create the workbook: file name and path
    workbook = xlw.Workbook('E:\\%s.xlsx' % filename)
    # Add a worksheet
    sheet = workbook.add_worksheet('weather_report')
    # Header titles: date, high, low, weather, wind direction
    title = ['日期', '最高气温', '最低气温', '天气', '风向']
    bold = workbook.add_format({'bold': True})
    for i in range(len(title)):
        # Write the header titles into the first row, in bold
        sheet.write_string(0, i, title[i], bold)
    row, col = 1, 0
    for a, b, c, d, e in weather_result:
        # Write each record into the sheet, column by column
        sheet.write_string(row, col, a)
        sheet.write_string(row, col + 1, b)
        sheet.write_string(row, col + 2, c)
        sheet.write_string(row, col + 3, d)
        sheet.write_string(row, col + 4, e)
        row += 1
    # Close the workbook
    workbook.close()
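To test the writer on its own, you can feed it a single hand-made record; the row below is a made-up placeholder in the five-column shape the scraper produces, not real scraped data:

sample = [['2020-09-01', '33℃', '24℃', '多云', '东北风2级']]
list_to_excel(sample, 'test')  # writes E:\test.xlsx with a bold header row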
5. Calling the functions
# City to query (in pinyin), start date, end date
data = getCommentsById('hunan', '2020-09', '2020-10')
# The scraped data and the Excel file name
list_to_excel(data, '湖南天气202009-202010')
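Note that the city argument is interpolated directly into the URL (http://lishi.tianqi.com/<city>/<yyyymm>.html), so it must match the pinyin slug that lishi.tianqi.com itself uses; 'hunan' here simply follows the original example.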
Full source code
# Module for working with dates and times
import datetime
# Data-analysis module (used here only for pd.date_range)
import pandas as pd
# Module for building the .xlsx file
import xlsxwriter as xlw
# Module used to fetch the pages
from urllib import request
# Parser for the content of HTML/XML tags
from bs4 import BeautifulSoup as bs


# Generate the sequence of months between two dates
def dateRange1(start, end):
    # Format every day in the range as 'YYYYMM', then dedupe and sort
    datelist1 = [datetime.datetime.strftime(x, '%Y%m')
                 for x in pd.date_range(start=start, end=end)]
    datelist = sorted(set(datelist1))
    return datelist
    # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']


# Fetch the pages, parse the HTML, filter the data, and return it as a list of rows
def getCommentsById(city, start, end):
    weather_result = []
    # Get the month sequence, e.g. ['202009', '202010']
    datelist = dateRange1(start, end)
    for month in datelist:
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        # Build the request for the weather page
        opener = request.Request(url)
        # Add an HTTP User-Agent request header
        opener.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        req = request.urlopen(opener).read()
        # Parse the HTML
        soup = bs(req, 'html.parser')
        # Filter the daily rows with the CSS selector 'div .thrui > li'
        weather_m = soup.select('div .thrui > li')
        # Each row holds five <div>s: date, high, low, weather, wind direction
        for day in weather_m:
            tt = []
            for j in range(5):
                t = day.find_all('div')[j].string
                # Replace None values, otherwise they cannot be written to Excel
                if t is not None:
                    tt.append(t)
                else:
                    tt.append('None')
            weather_result.append(tt)
    print(weather_result)
    return weather_result


# Write the list data to a local Excel file
def list_to_excel(weather_result, filename):
    # Create the workbook: file name and path
    workbook = xlw.Workbook('E:\\%s.xlsx' % filename)
    # Add a worksheet
    sheet = workbook.add_worksheet('weather_report')
    # Header titles: date, high, low, weather, wind direction
    title = ['日期', '最高气温', '最低气温', '天气', '风向']
    bold = workbook.add_format({'bold': True})
    for i in range(len(title)):
        # Write the header titles into the first row, in bold
        sheet.write_string(0, i, title[i], bold)
    row, col = 1, 0
    for a, b, c, d, e in weather_result:
        # Write each record into the sheet, column by column
        sheet.write_string(row, col, a)
        sheet.write_string(row, col + 1, b)
        sheet.write_string(row, col + 2, c)
        sheet.write_string(row, col + 3, d)
        sheet.write_string(row, col + 4, e)
        row += 1
    # Close the workbook
    workbook.close()


# City to query (in pinyin), start date, end date
data = getCommentsById('hunan', '2020-09', '2020-10')
# The scraped data and the Excel file name
list_to_excel(data, '湖南天气202009-202010')