python学习——xpath
2020-12-13 14:10
标签:设置 释放 无锡 tin encoding 种类 while leak 网络数 回顾
xpath使用(【重点】xpath表达式)
tree.xpath('xpath表达式')
tree.xpath('xpath表达式')
xpath表达式
案例:获取58二手房相关房源信息
import requests
from lxml import etree

# Example script: fetch one 58.com second-hand housing listing page and write
# "title:price" lines to 58.csv.
url = 'https://bj.58.com/beijingzhoubian/ershoufang/?PGTID=0d30000c-0000-1175-8e33-a6e941f8aff5&ClickID=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'
}

# Fetch the page source.
page_text = requests.get(url=url, headers=headers).text

# Instantiate an etree object from the HTML text.
tree = etree.HTML(page_text)

# Call xpath() to get the list of <li> elements under the listing <ul>.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# `with` guarantees the file is closed even if parsing one entry raises.
with open('58.csv', 'w', encoding='utf-8') as fp:
    # Walk every listing entry.
    for li in li_list:
        # A leading "." makes the expression local: "./" starts at this <li>.
        titles = li.xpath('./div[2]/h2/a/text()')
        if not titles:
            # Entries without a title link cannot be recorded; skip them
            # instead of failing on an empty result list.
            continue
        # The price is spread over several text nodes; join them into one string.
        price = ''.join(li.xpath('./div[3]//text()'))
        fp.write(titles[0] + ':' + price + '\n')

print('over')
over
案例:获取图片
import os
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

import requests
from lxml import etree

# Example script: download the thumbnails listed on one pic.netbian.com page
# into ./imgs, naming each file after the caption shown under the image.
url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'
}
response = requests.get(url=url, headers=headers)
#response.encoding = 'utf-8'

# Create the output directory if it does not exist yet.
os.makedirs('./imgs', exist_ok=True)

page_text = response.text
tree = etree.HTML(page_text)

# The container is a <div>; the original expression used the non-existent tag
# "dic" and therefore matched nothing.
li_list = tree.xpath('//div[@class="slist"]/ul/li')  # //div[@class="slist"]//li
for li in li_list:
    img_name = li.xpath('./a/b/text()')[0]
    # Repair mojibake in the Chinese caption: re-encode the text as
    # ISO-8859-1 bytes and decode those bytes as GBK.
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    # The src attribute is appended to the site root to form the full URL.
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = './imgs/' + img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_path, '下载成功')
案例:煎蛋网中图片数据:http://jandan.net/ooxx
import base64
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

import requests
from lxml import etree

# Example script: download the images referenced on http://jandan.net/ooxx.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'
}
url = 'http://jandan.net/ooxx'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# Collect the text of every <span class="img-hash"> element.
img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
for img_hash in img_hash_list:
    # Each hash is base64-decoded and prefixed with the scheme to form the URL.
    img_url = 'http:' + base64.b64decode(img_hash).decode()
    # Use the last path segment of the URL as the local file name.
    img_name = img_url.split('/')[-1]
    urllib.request.urlretrieve(url=img_url, filename=img_name)
爬取站长素材中的简历模板
import random

import requests
from lxml import etree

# Example script: walk the first three listing pages of the sc.chinaz.com
# resume-template section and download each template as "<name>.rar".
headers = {
    # Close the connection as soon as the request completes, so resources in
    # the connection pool are released promptly.
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'
}
url = 'http://sc.chinaz.com/jianli/free-%d.html'
for page in range(1, 4):
    # The first page has no numeric suffix; later pages follow the template.
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page

    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')

    for div in div_list:
        detail_urls = div.xpath('./a/@href')
        names = div.xpath('./a/img/@alt')
        if not detail_urls or not names:
            # This <div> lacks the link or the image caption; skip it
            # instead of failing on an empty result list.
            continue
        detail_url = detail_urls[0]
        name = names[0]

        detail_page = requests.get(url=detail_url, headers=headers).text
        # Separate name so the listing-page tree is not shadowed.
        detail_tree = etree.HTML(detail_page)
        download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        if not download_list:
            # random.choice raises IndexError on an empty sequence.
            continue

        # Pick one of the listed download links at random.
        download_url = random.choice(download_list)
        data = requests.get(url=download_url, headers=headers).content
        file_name = name + '.rar'
        with open(file_name, 'wb') as fp:
            fp.write(data)
        print(file_name, '下载成功')
机械电子工程师简历模板.rar 下载成功
设计师英文简历模板下载.rar 下载成功
化妆师个人简历范文.rar 下载成功
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
E:\Anaconda3\lib\socket.py in readinto(self, b)
588 try:
--> 589 return self._sock.recv_into(b)
590 except timeout:
OSError: [WinError 10051] 向一个无法连接的网络尝试了一个套接字操作。
During handling of the above exception, another exception occurred:
KeyboardInterrupt Traceback (most recent call last)