python-scrapy deep crawling
2021-01-15 03:14
Original post: https://www.cnblogs.com/shiyi525/p/14274049.html

Crawling a movie site (www.1905.com) with a deep, multi-level crawl: parse() scrapes the title and link of every movie on the paginated list pages, then issues a second Request to each detail page, passing the partially filled item along in the request's meta so that parse_href() can complete the item and hand it to the pipeline.

movie.py

import scrapy
from MyProjectDianying.items import MyprojectdianyingItem
class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx']
    # template for the paginated list pages
    url = 'https://www.1905.com/vod/list/n_1_t_1/o3p%d.html'
    page = 2

    def parse(self, response):
        # every div under section[4] is one movie card on the list page
        divs = response.xpath('//*[@id="content"]/section[4]/div')
        for div in divs:
            href = div.xpath('./a/@href')[0].extract()
            title = div.xpath('./a/@title')[0].extract()
            item = MyprojectdianyingItem()
            item["href"] = href
            item["title"] = title
            print(title)
            # deep crawl: follow the detail page and hand the item over via meta
            yield scrapy.Request(href, callback=self.parse_href, meta={'item': item})
        # crawl the next few list pages (the page limit of 5 is an assumption)
        if self.page <= 5:
            url = format(self.url % self.page)
            yield scrapy.Request(url, callback=self.parse)
            self.page += 1

    def parse_href(self, response):
        # pick up the item passed from parse() and complete it with the detail text
        detail = response.xpath('//*[@id="playerBoxIntroCon"]/text()')[0].extract()
        item = response.meta['item']
        item["detail"] = detail
        yield item
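As a side note, newer Scrapy releases (1.7 and later) also offer cb_kwargs for passing data between callbacks, which keeps user data out of response.meta. A minimal sketch of the same hand-off rewritten that way (only the two callbacks change; the rest of the project stays as shown):

    def parse(self, response):
        divs = response.xpath('//*[@id="content"]/section[4]/div')
        for div in divs:
            item = MyprojectdianyingItem()
            item["href"] = div.xpath('./a/@href')[0].extract()
            item["title"] = div.xpath('./a/@title')[0].extract()
            # cb_kwargs entries become keyword arguments of the callback
            yield scrapy.Request(item["href"], callback=self.parse_href, cb_kwargs={'item': item})

    def parse_href(self, response, item):
        # the item arrives as a normal parameter instead of response.meta['item']
        item["detail"] = response.xpath('//*[@id="playerBoxIntroCon"]/text()')[0].extract()
        yield item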
items.py

import scrapy


class MyprojectdianyingItem(scrapy.Item):
    # define the fields for your item here like:
    href = scrapy.Field()
    title = scrapy.Field()
    detail = scrapy.Field()

settings.py

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

ITEM_PIPELINES = {
    'MyProjectDianying.pipelines.MyprojectdianyingPipeline': 300,
}
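If all you need is the raw items on disk, Scrapy's built-in feed exports (available since Scrapy 2.1) can replace the hand-written pipeline below; a sketch of the extra settings.py entry, where the output file name dianying.jsonl is an assumption:

FEEDS = {
    'dianying.jsonl': {'format': 'jsonlines', 'encoding': 'utf8'},
}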
pipelines.py

class MyprojectdianyingPipeline:
    fp = None

    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.fp = open('dianying.txt', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        href = item["href"]
        title = item["title"]
        detail = item["detail"]
        self.fp.write(title + href + detail + '\n')
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.fp.close()
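To run the crawl, cd into the project directory and execute scrapy crawl movie; the pipeline then writes dianying.txt in the directory you run it from. Alternatively, a small launcher script can start it from plain Python. A sketch assuming the default Scrapy project layout (the file name run.py and the module path MyProjectDianying.spiders.movie are assumptions):

# run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from MyProjectDianying.spiders.movie import MovieSpider

if __name__ == '__main__':
    # get_project_settings() loads settings.py, so the pipeline and user agent are applied
    process = CrawlerProcess(get_project_settings())
    process.crawl(MovieSpider)
    process.start()  # blocks until the crawl finishes and dianying.txt is written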