简单百度贴吧网页爬取get方式
2021-04-02 17:25
标签:win64 inline res http 文件夹 exists 信息 coding write
原文地址:https://www.cnblogs.com/lizhihoublog/p/11206770.html
1 from urllib import request,parse
2 import os
3 # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0
4 # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50
5 # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=100
6 # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=150
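7 # Analysis:
8 # kw is the search keyword; pn selects the page and advances by 50 per page.
9 # So page 1 -> pn=0
10 # page 2 -> pn=50
11 # page 3 -> pn=100
12 # page n -> pn=(n-1)*50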
13
14
def query(base_url, kw, start, end):
    '''
    Download a range of listing pages and save each one as an HTML file.

    Page ``i`` is requested with ``pn=(i-1)*50`` and its decoded body is
    written to ``./teiba/<kw>/<i>.html`` (UTF-8). The output directory is
    created if it does not exist yet, even when the page range is empty.

    :param base_url: URL prefix ending in ``?``; the encoded query string
        (``kw``, ``ie``, ``pn``) is appended to it
    :param kw: forum keyword to search for; also used as the name of the
        output sub-directory
    :param start: first page number to fetch (inclusive; page 1 maps to pn=0)
    :param end: last page number to fetch (inclusive)
    :return: None; the pages are written to the output directory
    '''
    # NOTE: the directory is spelled 'teiba' (sic). It is kept unchanged so
    # that existing output locations do not move.
    dir_name = './teiba/' + kw + '/'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_name, exist_ok=True)
    for i in range(start, end + 1):
        query_msg = {
            'kw': kw,
            'ie': 'utf-8',
            'pn': (i - 1) * 50,  # page n starts at offset (n-1)*50
        }
        url = base_url + parse.urlencode(query_msg)
        req = request.Request(url, headers=header)
        # Close the response deterministically instead of leaking the
        # underlying connection until garbage collection.
        with request.urlopen(req) as resp:
            page = resp.read().decode('utf-8')
        with open(dir_name + str(i) + '.html', 'w', encoding='utf-8') as files:
            files.write(page)
42
if __name__ == '__main__':
    # Interactive entry point: ask for the forum keyword and the inclusive
    # page range, then download those pages via query().
    base_url = 'https://tieba.baidu.com/f?'
    kw = input('输入查询的贴吧>>')
    # int() raises ValueError if a page number is not a valid integer.
    start = int(input('输入起始页码>>'))
    end = int(input('输入终止页码>>'))
    query(base_url, kw, start, end)
上一篇:js 图片上传传给后台的3种格式
下一篇:webpack中的~