lxml webdriver 抓取街拍
2021-01-29 03:16
标签:api ash from exists img sel 图片 import selenium 案例 lxml webdriver 抓取街拍 标签:api ash from exists img sel 图片 import selenium 原文地址:https://www.cnblogs.com/luweiweicode/p/14335595.html
import os
from hashlib import md5
from selenium import webdriver
import requests
from lxml import etree
def get_response(url, timeout=10):
    """Request the search/home page at ``url`` with a browser-like User-Agent.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
    return requests.get(url, headers=headers, timeout=timeout)
def get_article_title_url(text, i):
    """Extract the gallery link and title of the ``i``-th result from a JSON response.

    Args:
        text: Response-like object whose ``json()`` method returns a mapping
            containing a ``"data"`` list.
        i: Index of the entry inside ``"data"``.

    Returns:
        A ``(article_url, title)`` tuple.

    Raises:
        KeyError: If the entry has no ``article_url`` or ``title`` key (not
            every entry carries gallery information).
        IndexError: If ``i`` is outside the ``"data"`` list.
    """
    # Decode the JSON body once instead of once per field.
    item = text.json()["data"][i]
    return item['article_url'], item['title']
def parse_article_url(article_url):
    """Collect the image URLs of one gallery page.

    The page is loaded in a Chrome instance driven by Selenium, and the
    resulting page source is parsed with lxml.

    Args:
        article_url: Address of the gallery page to open.

    Returns:
        The list of ``src`` attribute values matched by the XPath query
        (images with class ``syl-page-img`` inside the article content).
    """
    driver = webdriver.Chrome(r"D:\python\com\zxsoft\python\chromedriver.exe")
    try:
        driver.get(article_url)
        text = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # quit() also ends the driver session instead of only closing the window.
        driver.quit()
    html = etree.HTML(text)
    return html.xpath('//div[@class="article-content"]//div[@class="pgc-img"]//img[@class="syl-page-img"]//@src')
def save_jpg(title, href, timeout=10):
    """Download one image and store it in the folder named after ``title``.

    The file name is the MD5 hex digest of the image bytes plus ``.jpg``, so
    downloading the same image again overwrites the same file.

    Args:
        title: Name of an existing directory (relative to the current working
            directory) that receives the file.
        href: URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx response.
        OSError: If the file cannot be written.
    """
    res = requests.get(href, timeout=timeout)
    # Refuse to store an HTTP error page under a .jpg name.
    res.raise_for_status()
    file_path = '{}/{}.{}'.format(title, md5(res.content).hexdigest(), 'jpg')
    with open(file_path, 'wb') as f:
        f.write(res.content)
# All gallery folders are created relative to this working directory.
os.chdir(r"E:/ntmssFile/nv/")
# Walk 20 result pages of the search API, 20 entries per page.
for page in range(20):
    url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab'.format(page * 20)
    r = get_response(url)
    # Treat a missing or null "data" field as an empty page instead of
    # crashing on len(None).
    items = r.json().get("data") or []
    for idx in range(len(items)):
        # Not every entry carries a gallery title and link, and any single
        # download may fail; skip the entry and keep crawling.
        try:
            article_url, title_text = get_article_title_url(r, idx)
            os.makedirs(title_text, exist_ok=True)
            for href in parse_article_url(article_url):
                save_jpg(title_text, href)
        except Exception as exc:
            # `Exception` (not a bare except) lets Ctrl-C still stop the run.
            # The message is ASCII-only so printing cannot fail on consoles
            # with a limited encoding.
            print("skipped item {} on page {}: {}".format(idx, page, type(exc).__name__))
            continue
下一篇:Less.js用法
文章标题:lxml webdriver 抓取街拍
文章链接:http://soscw.com/index.php/essay/48499.html