Crawling Baidu Netdisk share feeds with Python
2021-01-20 14:14
This is a crawler I wrote before Baidu Netdisk was redesigned. Back then, many resource curators shared material through the share feed ("dynamics") on their Netdisk profiles. I followed a batch of accounts that shared film and TV resources, and the program crawled their feeds on a schedule and saved the Netdisk links they shared into my own database. Before writing to the database it checked whether the resource was already stored and filtered out bad keywords; on the other end, a web page or app displayed the resources from the database. Early Netdisk resource search sites worked on exactly this principle (a minimal sketch of the kind of query that display side might run is given after the link-checker code below). Baidu Netdisk has since been redesigned and the share feed removed, so the program can no longer run; this post is a record of the approach.

The main entry point implements the crawling of the share feeds and also calls the functions in the other files; running this one script keeps the crawler going continuously.

The database module contains the insert function and the dead-link checker: pass a crawled record to the insert function and it is written to the database. Expired links are normal, so the checker walks every link in the database and deletes the ones that no longer work.

There is also a small helper that skips a resource if its title contains a sensitive word, mainly to filter out ads.

An extension function crawls other sites as well: the share feeds alone may not yield enough resources, so pulling from additional sources makes it possible to build a more complete Netdisk resource search.

The database design is simple, with only two tables; the columns can be seen in the insert function, and a sketch of what the tables might look like is given after the keyword-filter code below.

Original post: https://www.cnblogs.com/hongming/p/12901445.html

Main program:
import requests, re, json, time
import random
from mysql_db import *
import threading
from aidy_pc import *
from yszx import *
from defs import *
header = {
    "Cookie": "",
    "Host": "pan.baidu.com",
    "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
# Fetch the uks (user ids) that the account follows
list_uk = ['2489863899']  # the crawling account's own uk; followed uks are appended below
def getShareUser():  # page through the follow list, 24 per request, collecting every followed uk
    start = 0
    for _ in range(100):
        try:
            url = 'https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=2489863899&limit=24&start=%d' % start
            follows_json = requests.get(url, headers=header).json()
            if len(follows_json['follow_list']) == 0:  # stop when a page comes back empty
                break
            lists = follows_json['follow_list']  # full list of followed users
            for i in lists:  # pull each followed user's uk out of the response
                list_uk.append(i['follow_uk'])  # add the uk to the list
            start = start + 24
            time.sleep(random.randint(10, 25))
        except Exception:
            continue
    return list_uk
# Program start
def gethtml():  # crawl the Netdisk share feeds
    uks = getShareUser()  # fetch the ids of the accounts I follow
    if not uks:  # nothing to crawl if the follow list is empty
        return
    for uk in uks:  # loop over the followed ids
        start = 0
        for n in range(2):  # page through the feed
            url = "https://pan.baidu.com/pcloud/feed/getdynamiclist?auth_type=1&filter_types=11000&query_uk=%s&category=0&limit=25&start=%s&bdstoken=29b0093f2c23b7afd5f41c39f57be34e&channel=chunlei&clienttype=0&web=1" % (
                uk, start)
            filelist_json = requests.get(url, headers=header).json()
            if filelist_json['errno'] != 0:
                break
            list_records = filelist_json['records']  # every resource returned by this request
            for data_vaule in list_records:  # walk each record dict
                if data_vaule['category'] == 3:  # skip images
                    continue
                if gjc_gl(data_vaule['title']) == False:  # keyword filter
                    continue
                #print(data_vaule['title'])
                print(data_vaule)
                #mysql_into(data_vaule)  # write to the database
                # fields: category (6 folder, 1 video, 3 image), shorturl (link), title, feed_time
            start = start + 25
            time.sleep(random.randint(10, 25))
if __name__ == '__main__':
    while True:
        try:
            gethtml()  # Netdisk feed crawler
            t1 = threading.Thread(target=bdsl)  # dead-link checker for stored Netdisk links
            #t2 = threading.Thread(target=aidy)  # crawler for the "aidy" movie site
            #t3 = threading.Thread(target=main_ys)
            t1.start()
            #t2.start()
            #t3.start()
            time.sleep(10800)  # wait 3 hours between rounds
        except Exception:
            continue
# Database connection and inserts
import pymysql, time
import requests, re
import random
def pysql():
    try:
        mysql = pymysql.connect(host='127.0.0.1', user='bdwp', password='xDnwLnjSEXLbGJYa',
                                database='bdwp', charset='utf8')
        #mysql = pymysql.connect(host='127.0.0.1', user='root', password='root', database='bdwp', charset='utf8')
        return mysql
    except Exception:
        print("Database connection failed!")
        exit()
def mysql_into(data_vaule):  # write a crawled record into the database
    mysql = pysql()
    db = mysql.cursor()
    sqlcx = "select title from data_zy WHERE title=%s"
    db.execute(sqlcx, (data_vaule['title'],))
    data = db.fetchall()
    if not data:  # only insert when the title is not already stored
        sqlcxid = "select max(id) from data_zy"
        db.execute(sqlcxid)
        dataid = db.fetchall()
        ids = int(dataid[0][0] or 0) + 1  # id of the last inserted row, plus one
        time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # time of insertion
        timeStamp = data_vaule['feed_time']  # convert the share time (milliseconds)
        timeStamp = float(timeStamp / 1000)
        timeArray = time.localtime(timeStamp)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        try:
            sqltj = "insert into data_zy (id,category,shorturl,title,feed_time,rk_time) VALUES (%s,%s,%s,%s,%s,%s)"
            db.execute(sqltj, (ids, data_vaule['category'], data_vaule['shorturl'],
                               data_vaule['title'], otherStyleTime, time_time))
            mysql.commit()
        except Exception:
            pass
        mysql.close()
    else:
        mysql.close()
        return False  # the file already exists in the database
# Check every stored Baidu link and delete the expired ones
def bdsl():
    header = {
        "Host": "pan.baidu.com",
        "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    mysql = pysql()
    db = mysql.cursor()
    sqlcx = "select id,shorturl from data_zy"
    db.execute(sqlcx)
    data = db.fetchall()
    # query done, now visit every share page
    for r in data:
        url = "https://pan.baidu.com/s/" + r[1]
        id = r[0]
        html = requests.get(url, headers=header).text.encode('iso-8859-1').decode('utf-8')  # re-decode the page as utf-8
        srt = "此链接分享内容可能因为涉及侵权、色情、反动、低俗等信息,无法访问!"  # message Baidu shows for dead or blocked shares
        if srt in html:
            sqlde = "DELETE FROM data_zy WHERE id = %s"
            db.execute(sqlde, (id,))
            mysql.commit()  # persist the delete
            time.sleep(random.randint(10, 25))
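As mentioned in the intro, a separate web page or app sits on the other end and searches and displays the stored resources; that front end is not part of this post. Purely as an illustration of how that side could query the data_zy table, here is a minimal sketch reusing pysql() from the database module above (the search_zy name, the keyword parameter, and the LIKE pattern are my own, not from the original code):

def search_zy(keyword, limit=20):
    # Illustrative only: fuzzy-match titles and return the newest shares first.
    mysql = pysql()
    db = mysql.cursor()
    sql = "select shorturl, title, feed_time from data_zy where title like %s order by feed_time desc limit %s"
    db.execute(sql, ('%' + keyword + '%', limit))
    rows = db.fetchall()
    mysql.close()
    # Each row is (shorturl, title, feed_time); a front end would render
    # https://pan.baidu.com/s/<shorturl> as the link for each result.
    return rows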
# Keyword filter: returns False when the title contains a blacklisted word
from mysql_db import pysql
def gjc_gl(title):
    mysql = pysql()
    db = mysql.cursor()
    sql = "select * from gjc_gl WHERE id=1"
    db.execute(sql)
    data = db.fetchall()[0][1]  # comma-separated blacklist string
    data = data.split(',')
    for trs in data:
        if trs in title:
            return False
    return True
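The original post never shows the table definitions, only that there are two tables. The following is a guess at what they might look like, reconstructed from the queries above (data_zy for the crawled resources, gjc_gl for the keyword blacklist); all column types, and the name of the blacklist column, are assumptions:

# schema_sketch.py -- hypothetical reconstruction, not part of the original project
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS data_zy (
        id INT PRIMARY KEY,        -- assigned manually as max(id)+1 in mysql_into()
        category INT,              -- 6 = folder, 1 = video, 3 = image
        shorturl VARCHAR(32),      -- appended to https://pan.baidu.com/s/
        title VARCHAR(255),
        feed_time DATETIME,        -- original share time (could equally be VARCHAR)
        rk_time VARCHAR(20)        -- insertion time, stored as 'YYYY-MM-DD HH:MM'
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS gjc_gl (
        id INT PRIMARY KEY,        -- gjc_gl() always reads the row with id=1
        words TEXT                 -- comma-separated blacklist; column name assumed
    ) DEFAULT CHARSET=utf8""",
]

conn = pymysql.connect(host='127.0.0.1', user='bdwp', password='xDnwLnjSEXLbGJYa',
                       database='bdwp', charset='utf8')
with conn.cursor() as cur:
    for stmt in DDL:
        cur.execute(stmt)
conn.commit()
conn.close()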
# import os
# import binascii
# cats = {
#     u'video': u'视频',
#     u'image': u'图片',
#     u'document': u'书籍',
#     u'music': u'音乐',
#     u'package': u'压缩',
#     u'software': u'软件',
# }
#
# def get_label(name):
#     if name in cats:
#         return cats[name]
#     return u'其它'
#
# # Helper: determine the file category from the file-name extension
# def get_category(ext):
#     ext = ext + '.'
#     cats = {
#         u'video': '.avi.mp4.rmvb.m2ts.wmv.mkv.flv.qmv.rm.mov.vob.asf.3gp.mpg.mpeg.m4v.f4v.',
#         u'image': '.jpg.bmp.jpeg.png.gif.tiff.',
#         u'document': '.pdf.isz.chm.txt.epub.bc!.doc.docx.xlsx.xls.pptx.ppt.',
#         u'music': '.mp3.wma.ape.wav.dts.mdf.flac.',
#         u'package': '.zip.rar.7z.tar.gz.iso.dmg.pkg.',
#         u'software': '.exe.app.msi.apk.',
#         u'torrent': '.torrent.'
#     }
#     for k, v in cats.items():
#         if ext in v:
#             return get_label(k)  # look up the Chinese label
#     return '其他'
# Extension crawler: scrapes another movie site for more Netdisk links
import requests, re, time
import random
import pymysql
from mysql_db import pysql
def aidy():
    for i in range(11, 24):  # 1000
        for r in range(1, 6):
            try:
                url = "http://520.58801hn.com/%d/page/%d" % (i, r)
                header = {
                    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Mobile Safari/537.36"}
                html = requests.get(url, headers=header).text
                re_url = re.findall('