网页爬虫---音乐

2021-01-22 23:15

阅读：716

标签：findall stat main tar int lse 集中 成功 import

import requests
import time
import re
import os

"""歌手字典"""
# Shared registry filled by the collectors below: singer name -> singer page URL.
song_dict = dict()

def song_static():
    """Collect singer links from the static singer index page.

    Downloads ``http://www.9ku.com/music/T_Singer.htm``, extracts
    ``(relative_url, singer_name)`` pairs with a regular expression and
    records them in the module-level ``song_dict`` as
    ``singer_name -> absolute URL``.

    Timeouts and HTTP error statuses are printed instead of raised, so the
    dictionary is always returned (possibly unchanged).

    Returns:
        dict: The shared ``song_dict``.
    """
    base_url = 'http://www.9ku.com'
    # NOTE(review): this pattern appears to have lost its HTML markup when the
    # listing was extracted from a web page. The loop below unpacks two
    # capture groups (relative link, singer name), so the original pattern
    # must be restored before this function can work.
    reg = r'(.*?)'
    try:
        response = requests.get(base_url + '/music/T_Singer.htm', timeout=30)
        # requests only raises HTTPError when asked to; without this call the
        # HTTPError handler below could never run.
        response.raise_for_status()
        for ul, title in re.findall(reg, response.text):
            song_dict[title] = base_url + ul
    except requests.exceptions.Timeout as e:
        print(e)
    except requests.exceptions.HTTPError as e:
        print(e)
    return song_dict


#动态歌手地址采集
def song_List():
    """Collect singer links from the paginated singer listing.

    Starting at page 2, fetches
    ``http://www.9ku.com/geshou/all-all-all/{page}.htm`` page by page and
    records every ``(relative_url, singer_name)`` match in the module-level
    ``song_dict`` as ``singer_name -> absolute URL``. Collection stops at the
    first page that yields no matches, or at the first timeout / HTTP error
    status (which is printed, not raised).

    Returns:
        dict: The shared ``song_dict``.
    """
    base_url = 'http://www.9ku.com'
    # NOTE(review): this pattern appears to have lost its HTML markup when the
    # listing was extracted from a web page. The loop below unpacks two
    # capture groups (relative link, singer name), so the original pattern
    # must be restored before this function can work.
    reg = r'(.*?)'
    page = 2
    print('数据采集中......')
    try:
        while True:
            print('正在采集第{}页数据'.format(page))
            response = requests.get(
                "http://www.9ku.com/geshou/all-all-all/{}.htm".format(page),
                timeout=30)
            try:
                # requests only raises HTTPError when asked to; without this
                # call the HTTPError handler below could never run.
                response.raise_for_status()
                html = response.text
            finally:
                # Release every page's connection, not only the last one.
                response.close()
            data = re.findall(reg, html)
            if not data:
                break
            for ul, title in data:
                song_dict[title] = base_url + ul
            page += 1
    except requests.exceptions.Timeout as e:
        print(e)
    except requests.exceptions.HTTPError as e:
        print(e)
    print('数据采集完成')
    return song_dict


def _strip_affix(text, affix):
    """Remove *affix* once from the start and once from the end of *text*."""
    if affix and text.startswith(affix):
        text = text[len(affix):]
    if affix and text.endswith(affix):
        text = text[:-len(affix)]
    return text


def _fetch(url):
    """GET *url* with a 30 second timeout, raising on HTTP error statuses."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response


def song_search(path=""):
    """Interactively download the songs of singers named by the user.

    Repeatedly prompts for a singer name. When the name is a key of the
    module-level ``song_dict``, the singer page is fetched, each play link
    found on it is turned into a ``/down/<id>`` page URL, and every
    ``(source_url, title)`` match on that page is downloaded to
    ``<path>/<title>.mp3``. Unknown names print a "not found" message.
    Timeouts and HTTP error statuses are printed and the affected singer or
    song is skipped. The prompt loop has no exit condition of its own.

    Args:
        path: Directory to save downloads into. The default ``""`` means the
            current working directory. The directory is created if missing.
    """
    base_url = 'http://www.9ku.com'
    # NOTE(review): both patterns appear to have lost their HTML markup when
    # the listing was extracted from a web page. ``play_link_reg`` must yield
    # play links and ``download_reg`` must capture two groups (audio source
    # URL, title); restore the originals before running.
    play_link_reg = r''
    download_reg = r'(.*?)'
    fetch_errors = (requests.exceptions.Timeout,
                    requests.exceptions.HTTPError)

    if path:
        os.makedirs(path, exist_ok=True)

    while True:
        name = input("请输入歌手名称：")
        if name not in song_dict:
            print("未找到歌手")
            continue

        try:
            singer_html = _fetch(song_dict[name]).text
        except fetch_errors as e:
            print(e)
            continue

        for play_link in re.findall(play_link_reg, singer_html):
            # str.strip('/play/') would strip any of the characters
            # '/', 'p', 'l', 'a', 'y' from both ends and can eat into the id;
            # remove the literal prefix instead.
            song_id = _strip_affix(play_link.strip('/'), 'play/')
            try:
                down_html = _fetch(base_url + '/down/' + song_id).text
                for src, title in re.findall(download_reg, down_html):
                    # Same reasoning: drop the literal marker, not a char set.
                    song_name = _strip_affix(title, 'Mp3下载')
                    audio = _fetch(src).content
                    time.sleep(1)  # pause between downloads
                    # Titles come from remote HTML; neutralise characters
                    # that are path separators or invalid in file names.
                    file_name = re.sub(r'[\\/:*?"<>|]', '_', song_name) + '.mp3'
                    # os.path.join keeps an empty ``path`` relative instead of
                    # producing '/<name>.mp3' at the filesystem root.
                    with open(os.path.join(path, file_name), 'wb') as f:
                        f.write(audio)
                    print('{}：下载成功'.format(song_name))
            except fetch_errors as e:
                # One failed song should not abort the remaining downloads.
                print(e)

if __name__ == '__main__':
    # Collect singers from the static index page.
    song_static()
    # Collect singers from the paginated listing.
    song_List()
    # Prompt for singer names and download their songs.
    song_search()
标签：findall   stat   main   tar   int   lse   集中   成功   import   
原文地址：https://www.cnblogs.com/sheshidu/p/13282811.html

上一篇：html

下一篇：Web前端工程师就业前景怎么样？整体薪资待遇好不好？

文章来自：搜素材网的编程语言模块，转载请注明文章出处。
文章标题：网页爬虫---音乐
文章链接：http://soscw.com/index.php/essay/45644.html

亲，登录后才可以留言！

网页爬虫---音乐

评论

热门文章

推荐文章

最新文章

置顶文章