Using Scrapy to Crawl Websites That Require Login or Registration
2021-03-28 07:27
Original source: https://www.cnblogs.com/funsion/p/12623276.html

#!/usr/bin/py2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.http import Request, FormRequest
from scrapy.spiders import CrawlSpider

from spider_test.items import SpiderTestItem
from spider_test import settings


# @author Funsion Wu
class ScrapyTestSpider(CrawlSpider):
    name = "spider_test"
    allowed_domains = [settings.SPIDER_DOMAIN]

    def start_requests(self):
        """Step 1: request the login page with the cookiejar enabled so a cookie is issued, and set the callback."""
        yield Request('http://%s/admin/account/login.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1}, callback=self.parse)

    def parse(self, response):
        data = dict(username="xiaoming",  # account field of the login form
                    password="888888")    # password field of the login form
        print('Logging in...')
        # Step 2: POST the form, carrying the cookie, browser headers, and credentials, to authorize the session cookie.
        yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,  # the real POST endpoint
                          meta={'cookiejar': 1},
                          formdata=data,
                          callback=self.jump_office_list)

    def jump_office_list(self, response):
        print('Requesting a page that can only be accessed after logging in...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1}, callback=self.parser_office_list)

    def parser_office_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # Follow every pagination link and parse it with this same method.
        page_list = soup.find(attrs={'class': 'pagination'}).find_all('a')
        if page_list:
            for page in page_list:
                page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                yield Request(page_url, meta={'cookiejar': 1}, callback=self.parser_office_list)
        # Follow every office link on the list page to its detail page.
        office_list = soup.find_all('a', attrs={'class': 'ui-office-list'})
        if office_list:
            for office in office_list:
                office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
                yield Request(office_url, meta={'cookiejar': 1}, callback=self.parse_article)

    def parse_article(self, response):
        test_item = SpiderTestItem()
        soup = BeautifulSoup(response.body, 'html.parser')
        container = soup.find('table', attrs={'class': 'index-statistics-table'})
        test_item['source_url'] = response.url
        test_item['title'] = soup.title.get_text()
        test_item['article_content'] = container.prettify()
        return test_item
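
The spider relies on two project files the post does not show: spider_test/items.py, which must define SpiderTestItem with the source_url, title, and article_content fields assigned above, and a SPIDER_DOMAIN entry in spider_test/settings.py. Below is a minimal sketch of what they might look like; the field names come from the spider itself, while the domain value is a placeholder assumption, and COOKIES_ENABLED is shown only as a reminder that the 'cookiejar' meta key needs Scrapy's cookies middleware (enabled by default).

# spider_test/items.py -- hypothetical sketch; field names taken from the spider above
import scrapy

class SpiderTestItem(scrapy.Item):
    source_url = scrapy.Field()       # URL of the scraped detail page
    title = scrapy.Field()            # <title> text of the page
    article_content = scrapy.Field()  # prettified HTML of the content table

# spider_test/settings.py -- only the entries the spider depends on (assumed values)
SPIDER_DOMAIN = 'www.example.com'     # placeholder domain used to build every request URL
COOKIES_ENABLED = True                # Scrapy default; required for the 'cookiejar' meta key to take effect

Run it with scrapy crawl spider_test from the project root. If the login does not stick, check the form field names posted to dologin.html, since the target site may use names other than username and password.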