The Scrapy Crawler Framework
Introduction
A quick, hands-on introduction to the Scrapy crawler framework.
Install Scrapy
pip install scrapy
Create a project
scrapy startproject spiderTest1
Create a spider
cd .\spiderTest1
scrapy genspider douban movie.douban.com
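genspider takes two arguments: the spider's name (douban) and the domain it is allowed to crawl (movie.douban.com). It generates spiders/douban.py from a template inside the project.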
Open the project in PyCharm
Add a virtual environment (every Python project should have its own dedicated virtual environment)
Settings -> Project -> Python Interpreter
Using the global Python interpreter is not recommended.
Click the gear icon -> Add, create the virtual environment under the project path, and name it venv (the equivalent command-line steps are shown below).
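If you prefer the command line to the PyCharm UI, the built-in venv module does the same job (Windows paths, matching the shell used above):

python -m venv venv
.\venv\Scripts\activate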
Project structure
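startproject generates the standard Scrapy layout:

spiderTest1/
    scrapy.cfg            # deploy configuration
    spiderTest1/          # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here (douban.py)
            __init__.py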
Inside the new virtual environment, install the scrapy package again (the same pip install scrapy as above).
Configure the settings.py file
# Set a request header to masquerade as a browser
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
ROBOTSTXT_OBEY = True  # whether to obey robots.txt
CONCURRENT_REQUESTS = 2  # concurrency level
DOWNLOAD_DELAY = 2  # download delay (seconds)
RANDOMIZE_DOWNLOAD_DELAY = True  # randomize the delay
# With multiple pipelines, items pass through the pipeline with the lower
# number first (here AccessPipeline at 200 runs before ExcelPipeline at 300)
ITEM_PIPELINES = {
    'spiderTest1.pipelines.ExcelPipeline': 300,
    'spiderTest1.pipelines.AccessPipeline': 200,
}
Run the spider
scrapy crawl spiderName --nolog  # --nolog suppresses log output
scrapy crawl spiderName -o douban.csv  # run the crawl and save the data as CSV (spiderName is the spider's name attribute, douban here)
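Scrapy infers the export format from the file extension, so -o douban.json or -o douban.jsonl work the same way.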
Third-party libraries
# write data to Excel from Python
pip install openpyxl
# work with an Access database from Python
pip install pyodbc
# work with a MySQL database from Python
pip install pymysql
List the packages that are already installed
pip list
pip freeze  # dependency list
Save the dependency list to requirements.txt
# > redirects the output to a file
pip freeze > requirements.txt
Install the dependencies from the list
pip install -r requirements.txt
Spider code
douban.py
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse

from spiderTest1.items import movieItem


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def start_requests(self):
        # Top250 is split across 10 pages of 25 movies each
        for page in range(10):
            # f-string builds the paginated URL
            yield Request(url=f'https://movie.douban.com/top250?start={page * 25}&filter=')

    def parse(self, response: HtmlResponse, **kwargs):
        sel = Selector(response)
        list_items = sel.css('#content > div > div.article > ol > li')
        for list_item in list_items:
            movie_item = movieItem()
            movie_item['title'] = list_item.css('span.title::text').extract_first()
            movie_item['rank'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = list_item.css('span.inq::text').extract_first()
            yield movie_item
        # Alternative: follow the pagination links instead of generating URLs
        # hrefs_list = sel.css('div.paginator > a::attr(href)')
        # for href in hrefs_list:
        #     url = response.urljoin(href.extract())
        #     yield Request(url=url)
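The spider imports movieItem from spiderTest1.items, a file the original post does not show. A minimal items.py consistent with the three fields used above would look like this (a sketch, not necessarily the author's exact file):

items.py

import scrapy


class movieItem(scrapy.Item):
    # The three fields populated in DoubanSpider.parse
    title = scrapy.Field()
    rank = scrapy.Field()
    subject = scrapy.Field()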
In the pipeline file, write the data to Excel, a database, and so on.
pipelines.py
import openpyxl
# import pymysql
import pyodbc


# Write to an Access database
class AccessPipeline:

    def __init__(self):
        # Connect to the database
        db_file = r"E:\left\Documents\spider.accdb"  # database file
        self.conn = pyodbc.connect(
            r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=" + db_file + ";Uid=;Pwd=;charset='utf-8';")
        # Create a cursor
        self.cursor = self.conn.cursor()
        # Buffer rows in a list so they can be inserted in batches
        self.data = []

    def close_spider(self, spider):
        # Flush whatever is still buffered, then close the connection
        self._write_to_db()
        self.conn.close()

    def _write_to_db(self):
        sql = r"insert into tb_top_movie (title, rating, subject) values (?, ?, ?)"
        if len(self.data) > 0:
            self.cursor.executemany(sql, self.data)
            self.conn.commit()
            # Clear the buffered rows
            self.data.clear()

    # Callback invoked for every item
    def process_item(self, item, spider):
        title = item.get('title', '')
        rank = item.get('rank', '')
        subject = item.get('subject', '')
        # Inserting one row at a time is slower:
        # sql = "insert into [tb_top_movie] (title, rating, subject) values('" + title + "','" + rank + "','" + subject + "')"
        # self.cursor.execute(sql)
        # Batch insert instead: flush every 50 rows
        self.data.append((title, rank, subject))
        if len(self.data) == 50:
            self._write_to_db()
        return item
# Write to Excel
class ExcelPipeline:

    def __init__(self):
        self.wb = openpyxl.Workbook()  # workbook
        self.ws = self.wb.active  # worksheet
        self.ws.title = 'Top250'
        # Header row: title, rating, subject
        self.ws.append(('标题', '评分', '主题'))

    def close_spider(self, spider):
        self.wb.save('电影数据.xlsx')  # save as '电影数据.xlsx' (movie data)

    # Callback invoked for every item
    def process_item(self, item, spider):
        title = item.get('title', '')
        rank = item.get('rank', '')
        subject = item.get('subject', '')
        self.ws.append((title, rank, subject))
        return item
# Write to MySQL
# class DBPipeline:
#     def __init__(self):
#         # Connect to the database
#         self.conn = pymysql.connect(host='localhost', port=9555,
#                                     user='left', passwd='123',
#                                     database='spider', charset='utf8mb4')
#         # Create a cursor
#         self.cursor = self.conn.cursor()
#
#     def close_spider(self, spider):
#         self.conn.commit()
#         self.conn.close()
#
#     # Callback invoked for every item
#     def process_item(self, item, spider):
#         title = item.get('title', '')
#         rank = item.get('rank', '')
#         subject = item.get('subject', '')
#         self.cursor.execute(
#             'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
#             (title, rank, subject)
#         )
#         return item
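If you uncomment the MySQL pipeline, remember to register it in settings.py as well; the priority value for it below (100, so it runs first) is an arbitrary choice:

ITEM_PIPELINES = {
    'spiderTest1.pipelines.ExcelPipeline': 300,
    'spiderTest1.pipelines.AccessPipeline': 200,
    'spiderTest1.pipelines.DBPipeline': 100,
}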