python-scrapy-增量式

2021-03-06 23:28

阅读：947

标签：crawl false pytho 允许管道 pid The art lse

movie.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zlsPro.items import ZlsproItem
from redis import Redis
class MovieSpider(CrawlSpider):
    """Incremental crawl spider for the 4567kan.com movie listing.

    Listing pages matched by ``rules`` are handed to ``parse_item``. Every
    movie detail URL found there is added to the Redis set ``movie_url``;
    only URLs that were not already in the set are followed, so repeated
    runs fetch just the newly published movies.
    """

    name = 'movie'
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/1.html']
    rules = (
        # Follow the pagination links of the listing and parse each page.
        Rule(
            LinkExtractor(allow=r'/index\.php/vod/show/id/1/page/\d+\.html'),
            callback='parse_item',
            follow=True,
        ),
    )
    # Redis connection used to remember which detail URLs were already seen.
    coon = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        """Yield a detail-page request for every movie not seen before.

        Args:
            response: A listing page response.

        Yields:
            ``scrapy.Request`` objects for new detail pages; the partially
            filled item travels along in ``meta['item']``.
        """
        for li in response.xpath('//div[1]/div/div/div/div[2]/ul/li'):
            title = li.xpath('./div/div/h4/a/text()').extract_first()
            path = li.xpath('./div/div/h4/a/@href').extract_first()
            if title is None or path is None:
                # An entry without a title link cannot be stored; skip it
                # rather than abort the whole page with an IndexError.
                continue

            href = 'https://www.4567kan.com' + path
            item = ZlsproItem()
            item['title'] = title
            item['href'] = href

            # sadd reports how many members were actually added: 1 means the
            # URL was not in the set yet, 0 means it was crawled before.
            # NOTE(review): the URL is recorded before the detail page is
            # fetched, so a failed detail request is not retried on later
            # runs -- confirm this is acceptable.
            ex = self.coon.sadd('movie_url', href)
            if ex == 1:
                print('有新增')
                yield scrapy.Request(
                    url=href,
                    callback=self.parse_href,
                    meta={'item': item},
                )
            else:
                print('暂无新增')

    def parse_href(self, response):
        """Complete the item with the description from the detail page.

        Args:
            response: A detail page response whose ``meta['item']`` holds
                the item created in ``parse_item``.

        Yields:
            The item with its ``detail`` field set (``None`` when the
            XPath matches nothing).
        """
        detail = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
        ).extract_first()
        item = response.meta['item']
        item['detail'] = detail
        yield item

settings.py

# Browser-like user agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False
# Only log errors.
LOG_LEVEL = 'ERROR'

# Use the duplicate filter provided by the scrapy-redis component.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether pausing is allowed (scheduler state is kept between runs).
SCHEDULER_PERSIST = True

# Item pipeline to use.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Redis database to connect to.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

items.py

import scrapy


class ZlsproItem(scrapy.Item):
    """Item holding one movie scraped by the ``movie`` spider."""

    # define the fields for your item here like:
    # Movie title, taken from the link text on the listing page.
    title = scrapy.Field()
    # Absolute URL of the movie's detail page.
    href = scrapy.Field()
    # Text extracted from the detail page.
    detail = scrapy.Field()
运行项目：scrapy crawl movie

python-scrapy-增量式

标签：crawl false pytho 允许管道 pid The art lse

原文地址：https://www.cnblogs.com/shiyi525/p/14286167.html

上一篇：Java获取指定类的内部结构

下一篇：【VBA】日期时间

文章来自：搜素材网的编程语言模块，转载请注明文章出处。
文章标题：python-scrapy-增量式
文章链接：http://soscw.com/index.php/essay/61075.html

亲，登录后才可以留言！

python-scrapy-增量式

评论

热门文章

推荐文章

最新文章

置顶文章