1、如何将一个scrapy爬虫项目修改成为一个简单的分布式爬虫项目

官方文档:https://scrapy-redis.readthedocs.io/en/stable/

只用修改scrapy项目的两个文件就可以了

一个是爬虫组件文件(爬虫文件):

# -*- coding: utf-8 -*-

import scrapy
from scrapy_redis.spiders import RedisSpider


# The spider class no longer inherits from a scrapy.spiders class but from
# scrapy_redis.spiders.RedisSpider, so start URLs are consumed from Redis.
class DistributedSpiderSpider(RedisSpider):
    """Crawl second-hand housing listings for Wuhan on lianjia.com.

    Crawl flow: city index page -> district listing pages (paginated)
    -> individual house detail pages.
    """

    name = 'distributed_spider'
    allowed_domains = ['wh.lianjia.com']
    # start_urls is replaced by redis_key: the name of a Redis list that
    # seeds the crawl. Push seed URLs with: LPUSH myspider:start_urls <url>
    # start_urls = ['https://wh.lianjia.com/ershoufang/']
    redis_key = "myspider:start_urls"

    def parse(self, response):
        """Parse the city index page and follow each district link."""
        print(response.meta)
        item = dict()
        item["province"] = "湖北"
        item["city"] = "武汉"
        blocks = response.xpath("//div[@class='position']/dl[2]/dd/div[1]/div/a")
        for block in blocks:
            # BUG FIX: the original read `blocks.xpath(...)`, which always
            # returned the FIRST anchor's title; use the loop variable.
            title = block.xpath("./@title").get()
            url = "https://wh.lianjia.com" + block.xpath("./@href").get()
            print(title, url, "=====================================")
            yield scrapy.Request(url=url, callback=self.parse_block,
                                 meta={"item": item})

    def parse_block(self, response):
        """Generate the paginated listing URLs (pages 1-99) for one district."""
        print(response.meta)
        # "pg%d" is the page number; "co32" is Lianjia's sort parameter.
        url = response.url + "pg%dco32/"
        index = 1
        while index < 100:
            new_url = url % index
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse_item,
                                 meta={"item": response.meta["item"]})
            index += 1

    def parse_item(self, response):
        """Parse one listing page and follow each house detail link."""
        print(response.meta)
        sellLinks = response.css("ul.sellListContent>li>a")
        for link in sellLinks:
            url = link.xpath("./@href").get()
            print(url)
            yield scrapy.Request(url=url, callback=self.parse_detail,
                                 meta={"item": response.meta["item"]})

    def parse_detail(self, response):
        """Extract the final item fields from a house detail page."""
        item = response.meta["item"]
        print(response.meta)
        item["url"] = response.url
        item["block"] = response.css("div.areaName>span.info").xpath("./a[1]/text()").get().strip()
        item["smallBlock"] = response.css("div.areaName>span.info").xpath("./a[2]/text()").get().strip()
        # NOTE(review): a unit suffix after the total price (likely "万")
        # appears to have been lost when this snippet was reformatted —
        # confirm against the original project before restoring it.
        item["price"] = response.xpath("//span[@class='total']/text()").get().strip() + ""
        item["unitPrice"] = response.xpath("//span[@class='unitPriceValue']/text()").get().strip() + "元/平米"
        print(item)
        # BUG FIX: the original only printed the finished item; yield it so
        # the configured item pipelines (incl. RedisPipeline) receive it.
        yield item

另一个是设置文件(settings.py):

在设置文件中添加几个设置项就可以了

# Use the scrapy-redis scheduler so pending requests are shared via Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Request fingerprints (hashes) are kept in a Redis set for de-duplication.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

ITEM_PIPELINES = {
   'DistributedSpider.pipelines.DistributedspiderPipeline': 300,
   # RedisPipeline additionally stores every scraped item in Redis.
   'scrapy_redis.pipelines.RedisPipeline': 400
}

REDIS_HOST = "localhost"  # address of the Redis server to connect to

# BUG FIX: use an integer port, as in the scrapy-redis documentation
# (the original assigned the string "6379").
REDIS_PORT = 6379  # default Redis port

# The default database is db0 (Redis ships with 16 databases, db0-db15).

# If the remote Redis server requires authentication, add the entry below.
REDIS_PARAMS = {
  "password": "********"
}