Question: After starting Scrapy, I want to repeatedly re-crawl a given day's data, but the old data from the previous run needs to be deleted before each crawl. Where should the deletion be implemented?
You can delete it in the pipeline's open_spider(self, spider) hook, which runs once when the spider starts, so the stale rows are removed at launch.
Here is the pipelines.py file:
```python
# -*- coding: utf-8 -*-
import sys
sys.path.append("/apps/jr_python/riskspiders")

import hashlib
import logging

from riskspiders.utils import DButil
from riskspiders.settings import DATABASE_PRM

logger = logging.getLogger(__name__)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class RiskspidersPipeline(object):
    # Connect to the database; the connection is ready as soon as the class is initialised.
    db = DButil(DATABASE_PRM)

    def process_item(self, item, spider):
        return item


class RiskspidersMySQLPipeline(object):

    def open_spider(self, spider):
        # Runs once at spider startup: delete the old rows for each day that
        # is about to be re-crawled, so the fresh data replaces them.
        print("open_spider, %s" % spider.name)
        self.db = DButil(DATABASE_PRM)
        for day in spider.day_list:
            sql_del = "delete from riskinfo where spider = %s and release_time = %s;"
            try:
                # Parameterised, matching how DButil.execute is called below.
                self.db.execute(sql_del, (spider.name, day))
            except Exception as e:
                print(e)

    def close_spider(self, spider):
        self.db.close()
        # Most of the collected stats can be printed here, but entries such as
        # finish_time are still missing because the crawl has not fully ended.
        print(spider.crawler.stats.get_stats())

    def process_item(self, item, spider):
        db = DButil(DATABASE_PRM)
        # Insert row by row; update on duplicate key.
        if spider.name == 'hexun_bankdata':
            logger.info('***** item_bank insert MySQL')
            pa = (item["source"], item["spider"], item['website_menu'],
                  item["disclosure_period"], item["bank_abbreviation"],
                  item["total_assets"], item["capital_adequancy_ratio"],
                  item["core_capital_adequancy_ratio"], item["bad_loan_ratio"],
                  item["provision_coverage"], item["url"], item["cra_time"],
                  item["cra_time"])
            sql_data = """insert into hexun_bankdata(source, spider, website_menu,
                disclosure_period, bank_abbreviation, total_assets,
                capital_adequancy_ratio, core_capital_adequancy_ratio,
                bad_loan_ratio, provision_coverage, url, cra_time)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                on duplicate key update cra_time = %s;"""
            try:
                db.execute(sql_data, pa)
            except Exception as e:
                print(e)
                logger.error(e)
            finally:
                db.close()
        else:
            # Hash title + content so duplicate articles map to the same content_id.
            md = hashlib.md5()
            str1 = '%s%s' % (item['title'], item['content'])
            md.update(str1.encode('utf-8'))  # hashlib needs bytes on Python 3
            md_value = md.hexdigest()
            logger.info('***** riskinfo insert MySQL')
            params = (item['source'], item['spider'], item['website_menu'],
                      item['release_time'], item['key_words'], item['neg_key_words'],
                      item['title'].strip(), item['source_type'], item['f_name'],
                      item['is_include_tbl'], item['content'].strip(),
                      item['content_web'], item['url'], item['father_url'],
                      item['cra_time'], md_value, item['cra_time'])
            try:
                db.execute(
                    """insert into riskinfo (source, spider, website_menu,
                    release_time, key_words, neg_key_words, title, source_type,
                    f_name, is_include_tbl, content, content_web, url,
                    father_url, cra_time, content_id)
                    values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    on duplicate key update cra_time = %s;""",
                    params
                )
            except Exception as e:
                print(e)
                logger.error(e)
            finally:
                db.close()
        return item  # pipelines should return the item for downstream stages
```
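For the open_spider/close_spider hooks above to fire at all, the pipeline must be enabled in the project settings. A minimal sketch of that registration (the priority value 300 is an arbitrary choice, not from the original post):

```python
# settings.py (excerpt)
ITEM_PIPELINES = {
    'riskspiders.pipelines.RiskspidersMySQLPipeline': 300,
}
```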
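As the comment in close_spider notes, stats such as finish_time are not available inside the pipeline because the crawl has not fully ended at that point. One way to read the complete stats is to drive the crawl from a script and inspect the stats collector after the process returns. A minimal sketch, assuming a spider class HexunBankdataSpider whose module path is hypothetical:

```python
# run_crawl.py -- a sketch; the spider class and its module path are assumed.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from riskspiders.spiders.hexun_bankdata import HexunBankdataSpider  # hypothetical path

process = CrawlerProcess(get_project_settings())
crawler = process.create_crawler(HexunBankdataSpider)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

# finish_time is present now, because the CoreStats extension has recorded it.
print(crawler.stats.get_stats())
```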