- spider.py文件配置
# -*- coding: utf-8 -*-
import scrapy
from itTeachers.items import ItteachersItem


class ItcastSpider(scrapy.Spider):
    """Spider that collects name/title/info records from the itcast.cn teacher page.

    Run with ``scrapy crawl itcast -o teachers.json`` to export the items.
    """

    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#']

    def parse(self, response):
        """Build one ``ItteachersItem`` per ``div.li_txt`` node in the page.

        Args:
            response: The downloaded response for one of ``start_urls``.

        Returns:
            list: ``ItteachersItem`` objects with ``name``, ``title`` and
            ``info`` filled from the first ``h3``, ``h4`` and ``p`` text node
            of each block. A field whose node is absent is set to ``''``.
        """
        # Debug tip: write response.body to a local file (binary mode) to
        # inspect the raw HTML the selectors run against.
        items = []

        for each in response.xpath('//div[@class="li_txt"]'):
            # Wrap the extracted data in an item object.
            item = ItteachersItem()

            # extract_first() yields the first matching text node, or the
            # default when nothing matches. Indexing extract()[0] raised
            # IndexError on a single incomplete block and, because the list
            # is only returned at the end, discarded every item of the page.
            item['name'] = each.xpath('h3/text()').extract_first(default='')
            item['title'] = each.xpath('h4/text()').extract_first(default='')
            item['info'] = each.xpath('p/text()').extract_first(default='')

            items.append(item)

        # Hand the complete list back to the Scrapy engine in one go.
        return items
- items.py文件配置
# -*- coding: utf-8 -*-

# Item models for the data scraped by this project.
#
# Reference:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ItteachersItem(scrapy.Item):
    """Scraped record made of three declared fields: name, title and info."""

    # Every attribute below declares one item field via scrapy.Field().
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:scrapy crawl itcast -o teachers.json 爬虫案例 - Python技术站