多线程爬取糗事百科热门段子 (改写前天的博客)
2020-12-13 04:46
标签:clear, tree, str, 请求, web, 分组, legend, 队列, 遍历。原文地址:https://www.cnblogs.com/springionic/p/11122261.html 。利用多线程爬取,除了先前用到的几个模块之外,还需用到 threading 模块和 queue 模块:
其中 task_done() 和 join() 这两步挺重要:task_done() 让队列的未完成任务计数减一,join() 让主线程阻塞等待,直到队列中的任务全部完成。
如果不做这两个步骤,会导致:主线程提前结束(守护子线程随之被终止,任务做不完),或者主线程一直阻塞、程序无法正常退出。
经过上面的解释,就直接上代码了:
1 import requests
2 import json
3 import threading
4 from queue import Queue
5 from lxml import etree
6
7
class QiubaSpider(object):
    """Multi-threaded crawler for the "hot" text posts on qiushibaike.com.

    Work flows through three queues: URLs -> raw HTML -> parsed item lists.
    Each stage runs in its own daemon thread(s); run() joins the queues so
    the main thread exits only after every queued task is marked done.
    """

    def __init__(self):
        # Page URL template; the hot/text section has 13 pages.
        self.url_temp = 'https://www.qiushibaike.com/text/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # decoded HTML pages waiting to be parsed
        self.content_queue = Queue()  # parsed item lists waiting to be saved

    def get_url_list(self):
        """Fill url_queue with the 13 hot-section page URLs."""
        for page in range(1, 14):
            self.url_queue.put(self.url_temp.format(page))

    def pass_url(self):
        """Worker: fetch each queued URL and enqueue the decoded HTML."""
        while True:
            url = self.url_queue.get()  # blocks until a URL is available
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()  # decrement the queue's task counter
            print(1)

    def get_content_list(self):
        """Worker: parse queued HTML pages and enqueue the extracted items."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            # Each post lives in its own <div> under #content-left.
            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []
            for div in div_list:
                item = {}
                item['content'] = div.xpath('.//div[@class="content"]/span/text()')
                item['content'] = [i.replace('\n', '') for i in item['content']]
                # Gender is encoded in the class name, e.g. "articleGenderManIcon".
                item['author_gender'] = div.xpath('.//div[contains(@class, "articleGend")]/@class')
                item['author_gender'] = item['author_gender'][0].split(' ')[-1].replace('Icon', '') if len(
                    item['author_gender']) > 0 else None
                item['author_age'] = div.xpath('.//div[contains(@class, "articleGend")]/text()')
                item['author_age'] = item['author_age'][0] if len(item['author_age']) > 0 else None
                item['author_img'] = div.xpath('.//div[@class="author clearfix"]//img/@src')
                # BUG FIX: the src attribute is protocol-relative ("//pic..."),
                # so the scheme needs a colon — 'https' + '//...' produced the
                # invalid URL 'https//...'.
                item['author_img'] = 'https:' + item['author_img'][0] if len(item['author_img']) > 0 else None
                item['stats_vote'] = div.xpath('.//span[@class="stats-vote"]/i/text()')
                item['stats_vote'] = item['stats_vote'][0] if len(item['stats_vote']) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()  # decrement the queue's task counter
            print(2)

    def save_content_list(self):
        """Worker: append each parsed item list to qiuba.txt as JSON."""
        while True:
            content_list = self.content_queue.get()
            with open('qiuba.txt', 'a', encoding='utf-8') as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=4))
                f.write('\n')
            self.content_queue.task_done()  # decrement the queue's task counter
            print(3)

    def run(self):
        """Start all worker threads, then wait for the queues to drain."""
        thread_list = []
        # 1. One thread builds the 13 page URLs.
        thread_list.append(threading.Thread(target=self.get_url_list))
        # 2. Five threads fetch pages concurrently.
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.pass_url))
        # 3. Three threads parse the fetched HTML.
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.get_content_list))
        # 4. One thread saves the results.
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # Daemon threads die with the main thread; the queue joins below
            # keep the main thread alive until all work is done.
            # (t.daemon = True replaces the deprecated t.setDaemon(True),
            # which was removed in Python 3.12.)
            t.daemon = True
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block until every task put on this queue is done
        print('主线程结束!')
94
95
if __name__ == '__main__':
    # Script entry point: build the spider and run the full pipeline.
    spider = QiubaSpider()
    spider.run()