"""古诗文爬取""" import requests import re def parse_page(url): rep = requests.get( url=url, headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"} ) text = rep.text # re正则匹配古诗文标题 titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL) # re正则匹配古诗文朝代 dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text) # re正则匹配古诗文作者 authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL) # re正则匹配古诗文内容 content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL) contents = [] # 清除诗文内容br标签 for content in content_tags: data = re.sub(r"<.*?>", "", content) contents.append(data.strip()) poems = [] # zip参数可放置一个或多个迭代器,并把对应的元素打包成元组 for value in zip(titles, dynasties, authors, contents): title, dynastie, author, content = value poem = { "title": title, "dynastie": dynastie, "author": author, "content": content } poems.append(poem) print(poems) def main(): for x in range(1, 101): url = "https://www.gushiwen.org/default_{}.aspx".format(x) parse_page(url) if __name__ == '__main__': main()
# View Code
# 本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:Python爬虫re解析实战 - Python技术站