import json
import random
import time

import pymongo
import pymysql
import requests

# MongoDB collection and MySQL connection shared by the pipeline
db = pymongo.MongoClient()['cs']['dn']
db1 = pymysql.connect(user='root', password='root', db='cs', charset='utf8')
cursor = db1.cursor()


class CsdnPipeline(object):
    def __init__(self):
        # titles already stored; Scrapy items are dict-like and not hashable,
        # so deduplicate on the title field instead of the whole item
        self.seen = set()

    def process_item(self, item, spider):
        if item['title'] not in self.seen:
            content_img = item['content_img']
            if len(content_img) > 0:
                # download every image and replace the URL list with local paths
                path = []
                for img in content_img:
                    # fractional part of the timestamp plus a large random number
                    # makes filename collisions unlikely
                    img_name = 'F:\\34\\tu\\' + str(time.time()).split('.')[1] \
                               + str(random.randrange(1, 9999999999999999999999999)) + '.jpg'
                    img_source = requests.get(img).content
                    with open(img_name, 'wb') as op:
                        op.write(img_source)
                    path.append(img_name)
                item['content_img'] = path
            else:
                item['content_img'] = '暂无图片'  # "no image available"

            # one copy of the item goes into MongoDB ...
            db.insert_one(dict(item))

            # ... and a JSON dump of the item goes into MySQL; a parameterized
            # query avoids the quoting and injection problems of string formatting
            data = json.dumps(dict(item), ensure_ascii=False)
            cursor.execute("insert into dn1(`data`) VALUES (%s)", (data,))
            db1.commit()

            self.seen.add(item['title'])
            return item
        else:
            print('已经存在')  # item already stored, skip it
            return item
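For this pipeline to run at all, it has to be registered in the Scrapy project's settings.py. A minimal sketch, assuming the project package is named csdn (a hypothetical name; use your own project's) and that the MySQL table dn1 already exists with a TEXT column named data:

# settings.py -- "csdn" is a hypothetical project package name
ITEM_PIPELINES = {
    'csdn.pipelines.CsdnPipeline': 300,  # lower numbers run earlier in the pipeline chain
}

Note that the deduplication set lives in memory, so it is emptied every time the spider restarts; for deduplication that survives across runs, a unique index on the title field in MongoDB (or in the MySQL table) would be needed instead.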