python-scrapy-增量式
2021-03-06 23:28
标签:crawl false pytho 允许 管道 pid The art lse movie.py rules = ( coon = Redis(host='127.0.0.1',port=6379) def parse_item(self, response): settings.py # 指定管道 # 指定数据库 items.py class ZlsproItem(scrapy.Item): 运行项目 scrapy crawl movie python-scrapy-增量式 标签:crawl false pytho 允许 管道 pid The art lse 原文地址:https://www.cnblogs.com/shiyi525/p/14286167.html
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zlsPro.items import ZlsproItem
from redis import Redis
class MovieSpider(CrawlSpider):
    """Incremental movie spider.

    Follows the paginated listing pages matched by ``rules`` and, for every
    movie entry, records its detail URL in the Redis set ``movie_url``.
    Only URLs that were not already in the set are requested, so repeated
    runs fetch new entries only.
    """

    name = 'movie'
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/1.html']

    # Follow every pagination link of the listing and parse each page.
    rules = (
        Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/1/page/\d+\.html'), callback='parse_item', follow=True),
    )

    # Redis client holding the set of detail URLs that were already seen.
    coon = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        """Yield a detail-page request for each movie URL not seen before.

        The partially filled item (title, href) travels with the request in
        ``meta['item']`` and is completed in :meth:`parse_href`.
        """
        li_list = response.xpath('//div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/div/h4/a/text()').extract_first()
            href_path = li.xpath('./div/div/h4/a/@href').extract_first()
            if href_path is None:
                # Entry without a link: nothing to request, so skip it
                # instead of aborting the whole page with an IndexError.
                continue
            href = 'https://www.4567kan.com' + href_path

            item = ZlsproItem()
            item['title'] = title
            item['href'] = href

            # sadd reports 1 when the URL was newly added to the set and
            # 0 when it was already a member.
            ex = self.coon.sadd('movie_url', href)
            if ex == 1:
                print('有新增')
                yield scrapy.Request(url=href, callback=self.parse_href, meta={'item': item})
            else:
                print('暂无新增')

    def parse_href(self, response):
        """Add the detail text to the item carried in ``response.meta``."""
        detail = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['detail'] = detail
        yield item


# ---- settings.py ----
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Use the de-duplication queue provided by the scrapy-redis component.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether pausing (and later resuming) the crawl is allowed.
SCHEDULER_PERSIST = True

# Specify the item pipeline.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Specify the database.
REDIS_HOST = '127.0.0.1'
# Port of the Redis server.
REDIS_PORT = 6379


# ---- items.py ----
import scrapy


class ZlsproItem(scrapy.Item):
    """Item with the three fields filled in by the movie spider."""

    # define the fields for your item here like:
    title = scrapy.Field()
    href = scrapy.Field()
    detail = scrapy.Field()
上一篇:Java获取指定类的内部结构
下一篇:【VBA】日期时间