Crawling property data from the whole Fangtianxia (房天下) site...
2021-05-15 16:29
The crawls I did before always pulled a bit too little data, so this time I am writing a whole-site crawler: Redis handles URL de-duplication, MySQL stores the cleaned property data, and a thread pool schedules the multi-threaded crawl.

Original post: http://www.cnblogs.com/Huangsh2017Come-on/p/7750417.html

Below are the second-hand-house and new-house URLs for every region on Fangtianxia; they provide the start URLs for the later crawl:

import requests
from lxml import etree


class Ftx_newhouse_Secondhandhouse(object):

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14'
    }

    def __init__(self):
        self.url = 'http://newhouse.fang.com/house/s/'
        self.s = requests.session()

    def Newhouse_ftx(self):
        # Fetch the new-house city list page; the site serves GBK/GB2312-encoded pages.
        try:
            response = self.s.post(self.url, headers=self.headers, verify=False)
        except Exception as e:
            print('error:', e)
            return {}
        response.encoding = 'gb2312'
        urls = etree.HTML(response.text)
        xf_adress = urls.xpath('//div[@class="city20141104"]/div[3]/a/text()|'
                               '//div[@class="city20141104"]/div[4]/a/text()|'
                               '//div[@class="city20141104"]/div[5]/a/text()'
                               )
        xf_url = urls.xpath('//div[@class="city20141104"]/div[3]/a/@href|'
                            '//div[@class="city20141104"]/div[4]/a/@href|'
                            '//div[@class="city20141104"]/div[5]/a/@href'
                            )
        # Map each city name to its new-house start URL.
        return dict(zip(xf_adress, xf_url))

    def Secondhandhouse_ftx(self):
        # Fetch the second-hand-house city list page.
        self.url = 'http://esf.sh.fang.com/newsecond/esfcities.aspx'
        try:
            html = requests.get(self.url, headers=self.headers, timeout=4)
        except Exception as e:
            print('error:', e)
            return {}
        html.encoding = 'gb2312'
        Secondhandhouse_urls = etree.HTML(html.text)
        xf_url = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/text()')
        xf_adress = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/@href')
        # Map each city name to its second-hand-house start URL
        # (the variable names are swapped here, but the mapping is name -> link).
        dictx = dict(zip(xf_url, xf_adress))
        return dictx
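The intro mentions Redis for URL de-duplication, but neither listing in this post actually touches Redis. Below is a minimal sketch of how the start URLs (or any listing URLs discovered later) could be de-duplicated with a Redis set; the local host/port, the key name ftx:seen_urls, and the helper url_is_new are all assumptions for illustration, not part of the original code.

import redis

# Assumes a Redis server on localhost; the key name is a placeholder.
r = redis.Redis(host='localhost', port=6379, db=0)

def url_is_new(url):
    # SADD returns 1 only when the member was not already in the set,
    # so one round trip both records the URL and tells us whether to crawl it.
    return r.sadd('ftx:seen_urls', url) == 1

# Example: keep only URLs we have not seen before.
# start_urls = [u for u in Ftx_newhouse_Secondhandhouse().Secondhandhouse_ftx().values()
#               if url_is_new(u)]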
Below is the code that crawls the property data:

import requests, redis, pymysql
from mywed.fangtianxia.url import Ftx_newhouse_Secondhandhouse
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import re, os, time
from mywed.fangtianxia.logs import log_run

# Build the set of start URLs from the city list scraped above.
Secondhandhouse_urls_set = {'http://esf.hbjs.fang.com'}
dr = Ftx_newhouse_Secondhandhouse()
w = dr.Secondhandhouse_ftx()
for i in w.values():
    Secondhandhouse_urls_set.add(i)
print(Secondhandhouse_urls_set)


class Secondhandhouse(object):

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14'
    }

    def get_newhouse_data(self, url):
        # Listing pages are paginated as /house/i3<n>; walk them until the "next page" link disappears.
        for num in range(102):
            second_url = url + '/house/i3' + str(num)
            try:
                while True:
                    response = requests.get(second_url, headers=self.headers, timeout=3)
                    response.encoding = 'gbk'
                    #print(response.text)
                    if response.status_code == 200:
                        break
                    else:
                        print('restart downloading ......')
            except Exception as e:
                log_run.File_enter_error(e)
                continue
            select = etree.HTML(str(response.text))

            # No "next page" link means we have reached the last page for this city.
            if not len(select.xpath('//a[@id="PageControl1_hlk_next"]/text()')):
                break
            else:
                content_list = select.xpath('//dd[@class="info rel floatr"]')
                #print(content_list)

                for i in content_list:
                    title = i.xpath('./p[1]/a/@title')
                    content = i.xpath('./p[2]/text()')
                    name = i.xpath('./p[3]/a/span/text()')
                    adress = i.xpath('./p[3]/span/text()')
                    try:
                        size_list = select.xpath('//div[@class="area alignR"]')
                        size = [ii.xpath('./p/text()') for ii in size_list]
                        average_price_list = select.xpath('//p[@class="danjia alignR mt5"]')
                        average_price = ['/'.join(iii.xpath('./text()')) for iii in average_price_list]
                        sum_price_list = select.xpath('//p[@class="mt5 alignR"]')
                        sum_price = [''.join(iiii.xpath('./span/text()')) for iiii in sum_price_list]
                    except Exception as e:
                        log_run.File_enter_error(e)
                    print(title)


if __name__ == "__main__":
    t = Secondhandhouse()
    # Pass a city base URL; the method appends the /house/i3<n> page path itself.
    t.get_newhouse_data('http://esf.fang.com')
    #s = t.get_newhouse_data
    #pool = ThreadPoolExecutor(30)
    #f = pool.map(s, Secondhandhouse_urls_set)
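The commented-out lines at the end show the intended scheduling: one worker per city start URL through a thread pool. Here is a minimal sketch of that idea, assuming the Secondhandhouse class and Secondhandhouse_urls_set defined above; the 30-worker count simply mirrors the commented-out ThreadPoolExecutor(30).

from concurrent.futures import ThreadPoolExecutor, as_completed

t = Secondhandhouse()
with ThreadPoolExecutor(max_workers=30) as pool:
    # One task per city base URL.
    futures = [pool.submit(t.get_newhouse_data, base_url) for base_url in Secondhandhouse_urls_set]
    for future in as_completed(futures):
        # Re-raise any exception from a worker instead of losing it silently.
        future.result()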
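pymysql is imported but never used in the listing above; the cleaned fields are only printed. A minimal sketch of the MySQL storage step described in the intro is shown below, assuming a local database named fangtianxia and a table esf_house with the columns shown; the connection parameters, table name, column names, and the helper save_house are all placeholders, not part of the original code.

import pymysql

# Placeholder connection settings for a local MySQL instance.
conn = pymysql.connect(host='localhost', user='root', password='******',
                       database='fangtianxia', charset='utf8mb4')

def save_house(title, name, adress, size, average_price, sum_price):
    # Insert one cleaned listing; table and column names are made up for this sketch.
    sql = ('INSERT INTO esf_house (title, community, adress, size, average_price, sum_price) '
           'VALUES (%s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cursor:
        cursor.execute(sql, (title, name, adress, size, average_price, sum_price))
    conn.commit()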