# Python搜索B站视频并且爬取
# 2021-03-06 11:29
# 标签:安装 ica content down video webp iter div encoding
# 目前遍历循环仍未完成,所以只会下载第一个结果,后续会完善成接口,可以单独调用。
# 其中还有音频和视频的合并,需要先安装 ffmpeg 环境。
# 原文地址:https://www.cnblogs.com/duanminkid/p/14300350.html
# -*- coding: utf-8 -*-
import requests
from urllib import parse,request
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import subprocess
import time
import json
import sys
import io
import ffmpeg
# Switch stdout to UTF-8 (presumably so the Chinese titles this script prints
# do not fail on consoles with a non-UTF-8 default encoding -- confirm).
# reconfigure() changes the encoding of the existing stream in place instead of
# stacking a second text wrapper on the same underlying buffer.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
class BiliBili(object):
    """Search Bilibili, download the first hit's video/audio streams and merge them.

    The instance is created with a search-result URL.  ``run_search`` fetches
    that page, picks the first result, downloads its video and audio streams to
    ``<title>.mp4`` / ``<title>.mp3`` in the current directory and merges them
    with ffmpeg into ``<title>(合).mp4``.
    """

    # Location of ffmpeg used when it exists on disk; otherwise the ``ffmpeg``
    # found on PATH is used (see ``video_audio_merge_single``).
    FFMPEG_PATH = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe"

    # Seconds to wait for a connection / for the next bytes of a response.
    REQUEST_TIMEOUT = 30

    # Bytes read per iteration while streaming a download to disk.
    CHUNK_SIZE = 64 * 1024

    # Browser-like headers sent when fetching HTML pages.
    _PAGE_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
    }

    # NOTE(review): the regex was empty in the original code, which makes
    # re.findall(...)[0] return '' and json.loads('') raise.  It is
    # reconstructed here to capture the JSON assigned to window.__playinfo__,
    # which is presumably where the data -> dash -> video/audio structure read
    # below comes from -- confirm against a live video page.
    _PLAYINFO_RE = re.compile(
        r"<script>window\.__playinfo__=(.*?)</script>", re.S
    )

    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, url):
        """Remember the search-result page URL to start from."""
        self.url = url

    def html(self, url):
        """Fetch ``url`` with browser-like headers and return the body text.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(
            url, headers=self._PAGE_HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.text

    def get_video_html(self, url):
        """Fetch a video page; same request as ``html``."""
        return self.html(url)

    def _parse_playinfo(self, html):
        """Return the decoded ``window.__playinfo__`` JSON embedded in ``html``.

        Raises:
            ValueError: if the script tag is missing or its content is not
                valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        match = self._PLAYINFO_RE.search(html)
        if match is None:
            raise ValueError("window.__playinfo__ not found in the video page")
        return json.loads(match.group(1))

    def get_video_info(self, html):
        """Return the first backup URL of the first DASH video stream."""
        playinfo = self._parse_playinfo(html)
        return playinfo["data"]["dash"]["video"][0]["backup_url"][0]

    def get_audio_info(self, html):
        """Return the first backup URL of the first DASH audio stream."""
        playinfo = self._parse_playinfo(html)
        return playinfo["data"]["dash"]["audio"][0]["backup_url"][0]

    @staticmethod
    def _strip_scheme(href):
        """Drop a leading ``//``, ``http://`` or ``https://`` from ``href``.

        Only the prefix is removed; a ``//`` later in the URL is kept.
        """
        return re.sub(r"^(?:https?:)?//", "", href)

    @classmethod
    def _safe_filename(cls, title):
        """Turn a scraped title into a string usable as a file name."""
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub("_", title).strip()
        return cleaned or "video"

    def search_video_info(self, html):
        """Return ``(title, video_url)`` for the first usable search result.

        ``video_url`` is returned without a scheme (``host/path...``); callers
        prepend ``https://``.

        Raises:
            ValueError: if the page contains no result with a title and a link.
        """
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all("div", class_="info"):
            title_link = tag.find("a", class_="title")
            first_link = tag.find("a")
            if title_link is None or first_link is None:
                continue
            href = first_link.get("href")
            if not href:
                continue
            # Only the first complete result is used.
            return title_link.get_text(), self._strip_scheme(href)
        raise ValueError("no video found in the search result page")

    def search_video(self, html):
        """Pick the first result from a search page and download it."""
        title, video_url = self.search_video_info(html)
        print(title)
        print(video_url)
        print(self.url)
        self.run_video(title, video_url, self.url)

    def run_search(self):
        """Fetch the search page given to the constructor and download its first hit."""
        page = self.html(self.url)
        self.search_video(page)

    def _download(self, url, path, headers):
        """Stream ``url`` into the file ``path`` and return the bytes written.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status (nothing is written in that case).
        """
        response = requests.get(
            url, stream=True, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        try:
            # Check the status before touching headers or the file system.
            response.raise_for_status()
            size = response.headers.get("content-length")
            if size is not None:
                print("[文件大小]:%0.2f MB" % (int(size) / 1024 / 1024))
            written = 0
            with open(path, mode="wb") as out:
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    if chunk:
                        out.write(chunk)
                        written += len(chunk)
            return written
        finally:
            response.close()

    def run_video(self, title, video_url, url):
        """Download the video and audio streams of one video page and merge them.

        Args:
            title: video title; used (sanitized) as the output file stem.
            video_url: video page address without scheme (``host/path``).
            url: search page URL the result came from; only printed.
        """
        page_url = "https://" + video_url
        print("视频名称:" + title)
        print(url)
        print(page_url)
        page_html = self.get_video_html(page_url)
        download_video_url = self.get_video_info(page_html)
        download_audio_url = self.get_audio_info(page_html)
        # No explicit Accept-Encoding: requests advertises only the encodings
        # it can actually decode, so the bytes written to disk are never left
        # compressed.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Referer": page_url,
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }
        file_stem = self._safe_filename(title)
        self._download(download_video_url, file_stem + ".mp4", headers)
        self._download(download_audio_url, file_stem + ".mp3", headers)
        print("正在保存:", title)
        self.video_audio_merge_single(file_stem)

    def video_audio_merge_single(self, video_name, ffmpeg_path=None):
        """Merge ``<video_name>.mp4`` and ``<video_name>.mp3`` into ``<video_name>(合).mp4``.

        Args:
            video_name: file stem shared by the two input files.
            ffmpeg_path: ffmpeg executable to run.  Defaults to ``FFMPEG_PATH``
                when that file exists, otherwise ``ffmpeg`` from PATH.

        Raises:
            FileNotFoundError: if the ffmpeg executable cannot be started.
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        print("视频合成开始:", video_name)
        if ffmpeg_path is None:
            if os.path.isfile(self.FFMPEG_PATH):
                ffmpeg_path = self.FFMPEG_PATH
            else:
                ffmpeg_path = "ffmpeg"
        # Argument list with no shell: the file name comes from scraped text
        # and must never be interpreted by a shell.
        command = [
            ffmpeg_path,
            "-y",  # overwrite an existing output instead of prompting
            "-i", video_name + ".mp4",
            "-i", video_name + ".mp3",
            "-vcodec", "copy",
            "-acodec", "copy",
            video_name + "(合)" + ".mp4",
        ]
        print(" ".join(command))
        # Block until ffmpeg has finished rather than sleeping a fixed time.
        subprocess.run(command, check=True)
        print("视频合成结束:", video_name)
# Base address of the Bilibili search page.
SEARCH_URL = "https://search.bilibili.com/all?"

# Fixed query parameters appended after the keyword.
SEARCH_PARAMS = "&from_source=nav_searchs&pm_id_from=333.851.b_696e7465726e6174696f6e616c486561646572.15"


def build_search_url(keyword):
    """Return the search-page URL for ``keyword`` (percent-encoded as UTF-8)."""
    return SEARCH_URL + "keyword=" + urllib.parse.quote(keyword) + SEARCH_PARAMS


if __name__ == "__main__":
    keyword = "哈哈哈哈哈"  # title of the video to search for
    url = build_search_url(keyword)
    BB = BiliBili(url)
    BB.run_search()