"""古诗文爬取"""
import requests
import re


def _extract_poems(text):
    """Build poem records from the raw HTML text of one page.

    Four independent regex scans collect titles, dynasties, authors and
    poem bodies; the resulting lists are then paired up by position.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys "title", "dynastie", "author" and
        "content". The list is empty when any of the scans finds nothing.
    """
    # Title: text of the first <b>...</b> after each <div class="cont">.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasty: text of the first <a>...</a> after each <p class="source">.
    # This scan runs without re.DOTALL, so a match never spans a line break.
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text)
    # Author: text of the <a>...</a> matched by the second "<a" in the pattern.
    authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    # Body: everything inside each <div class="contson" ...> element.
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    # Drop every remaining tag (e.g. <br>) and trim surrounding whitespace.
    contents = [re.sub(r"<.*?>", "", raw).strip() for raw in content_tags]
    # zip() stops at the shortest list, so if the scans return different
    # numbers of matches the surplus entries are silently dropped.
    return [
        {
            "title": title,
            # The key spelling "dynastie" is kept as-is: it is part of the output.
            "dynastie": dynasty,
            "author": author,
            "content": content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def parse_page(url, timeout=10):
    """Download one page, print the poems found on it and return them.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The list of poem dicts that was printed (keys "title", "dynastie",
        "author", "content").

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    rep = requests.get(
        url=url,
        headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"},
        timeout=timeout,
    )
    poems = _extract_poems(rep.text)
    print(poems)
    return poems


def main():
    """Process listing pages 1 through 100, one page at a time."""
    url_template = "https://www.gushiwen.org/default_{}.aspx"
    for page_no in range(1, 101):
        page_url = url_template.format(page_no)
        parse_page(page_url)


# Start the scrape only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

# View Code  (stray page text, not Python; commented out so the module parses)