Web Scraping with Python (Part 1)
2020-12-13 03:26
Original article: https://www.cnblogs.com/sima-3/p/11074946.html

I. The basic crawler workflow

# 1. Send a request:
Use an HTTP library to send a request to the target site, i.e. send a Request.
A Request contains the request headers, the request body, and so on.
# 2. Get the response content
If the server responds normally, you get back a Response.
A Response can contain HTML, JSON, images, video, and so on.
# 3. Parse the content
Parsing HTML: regular expressions, or third-party parsing libraries such as BeautifulSoup and pyquery
Parsing JSON: the json module
Parsing binary data: write it to a file in binary ('b') mode
# 4. Save the data
To a database
To a file
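These four steps map almost one-to-one onto code. A minimal end-to-end sketch, assuming httpbin.org as a stand-in target (the URL, regex, and file name here are illustrative placeholders, not from the original post):

import re
import requests

# 1. Send a request
res = requests.get('https://httpbin.org/html')   # placeholder target URL

# 2. Get the response content
html = res.text

# 3. Parse the content (here: grab the <h1> text with a regex)
titles = re.findall(r'<h1>(.*?)</h1>', html, re.S)

# 4. Save the data (to a file)
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(titles))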
II. Let's crawl a campus beauty site (校花网)

import requests
import re

# The crawler in three steps
# Step 1: send the request
def get_page(url):
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    # The pattern below was truncated in the original post; this regex for
    # extracting detail-page URLs is an assumption, not the author's original.
    detail_urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    return detail_urls
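The rest of the serial version (parsing the detail pages and saving the files) was cut off in the source. A minimal sketch of the missing steps, assuming the same three-step layout; the detail-page pattern, start URL, and download directory are all assumptions:

import os

# Parse a detail page for the media source (assumed pattern)
def parse_detail(detail_page):
    urls = re.findall('<source src="(.*?)"', detail_page, re.S)
    return urls[0] if urls else None

# Step 3: save the data
def save(media_url):
    res = requests.get(media_url)
    name = media_url.rsplit('/', 1)[-1]          # derive a file name from the URL
    with open(os.path.join('downloads', name), 'wb') as f:
        f.write(res.content)

if __name__ == '__main__':
    os.makedirs('downloads', exist_ok=True)
    index_page = get_page('http://www.xiaohuar.com/v/')   # assumed start URL
    for detail_url in parse_index(index_page):
        media_url = parse_detail(get_page(detail_url))
        if media_url:
            save(media_url)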
The concurrent version:

# pip3 install requests
import requests
import re
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)

# The crawler in three steps
# Step 1: send the request
def get_page(url):
    print('%s GET start ...' % url)
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    # Grab the index page's return value (this runs as a Future callback)
    res = index_page.result()
    # The pattern below was truncated in the original post (same assumed
    # regex as the serial version above).
    detail_urls = re.findall('class="items".*?href="(.*?)"', res, re.S)
    return detail_urls
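The concurrent version was also cut off here. A sketch of how the pool would be wired up, assuming the callback style that parse_index is written for (its call to .result() implies it receives a finished Future); the start URL is the same assumption as above:

if __name__ == '__main__':
    # Submit the index request to the pool; parse_index runs as a callback
    # and receives the finished Future, calling .result() as defined above.
    pool.submit(get_page, 'http://www.xiaohuar.com/v/').add_done_callback(parse_index)
    pool.shutdown(wait=True)   # block until all submitted work finishes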
III. Basic usage of requests

1. Two ways to make a GET request:

import requests
from urllib.parse import urlencode

# Request URL
base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Request method: GET
res = requests.get(base_url, headers=headers)
# print(res)          # a Response object
# print(res.text)     # the full HTML text
# print(res.content)  # the raw binary content
with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
URL-encoding by hand every time is tedious, so you can simply pass the parameters to GET via params:

import requests

# Request URL
base_url = 'https://www.baidu.com/s?'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Request method: GET
res = requests.get(base_url, headers=headers, params={"wd": "黄云"})
with open('小云云.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
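Under the hood, params URL-encodes the dict and appends it to the query string, so the two approaches are equivalent; res.url shows the final encoded URL:

# The final, fully encoded request URL
print(res.url)   # e.g. https://www.baidu.com/s?wd=%E9%BB%84%E4%BA%91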
A GET request to Zhihu:

# Visit Zhihu
# Request URL
zhi_url = 'https://www.zhihu.com/explore'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Request method: GET
res = requests.get(zhi_url, headers=headers)
with open('知乎.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
A GET request to GitHub:

import requests

# Request headers for a logged-in page
url = 'https://github.com/settings/emails'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia%2FShanghai; _gh_sess=U0hueWR2WmcvMEJ3amVCTFpOVm5KUDFob1FQUHBtd1BYK09ENkU0aTBqK1JrYmFiYTd6K3pLb0pSVDV5UzdOU0oxbGluSDR3dmVJYTA3WlVpaHZ2cWJmQTJrVTQzRHVFa1cvT1hrWG1ON1ZMRm1DeEtkQkhDRUVaK2cwUUpRN29UUnlyWnRCODQ3cTRLYWZkcmN5UHdnPT0tLUgxSmxJMUQzWDllblhFT3JMK083Tnc9PQ%3D%3D--92e621b5b1d19cf03e157bf61e02ded6a1a248c6'
}
# Request headers for the email page
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia%2FShanghai; _gh_sess=SE5mdjlBaWtla3B2czNYZFI5UTF6TEhUbERvellXVTZnUVE3d0hjTDBTb3RtZ0UxTXhYSCt4S2h2NXR2c3h2YVNaZUNITHlCOE9GcmhIM2lweVFVellYMExxV3dEK0R1ZU15cUEycmxIRk4yZW1WT2J5c3hFVHZ4Y3ZOaUhBN0ZseWcyTmMwNWxPTEIrMmpnVVpKRUJRPT0tLTdNcFZsOTFidnpxZk05cWVZUmV0MkE9PQ%3D%3D--6064098de4400f5a7ac71cdd3806abd11b2a0134'
}
# Request method: GET
# res = requests.get(url, headers=headers_2)
res = requests.get(url, headers=headers)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
print('1059239165' in res.text)
# True
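requests can also take the cookie as a cookies dict instead of a raw Cookie header, which keeps headers and session state separate. A sketch using a subset of the cookie values from the header above:

# Pass cookies separately instead of embedding them in the Cookie header
cookies = {
    'user_session': 'HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi',
    'logged_in': 'yes',
}
res = requests.get(url, headers={'User-Agent': headers['User-Agent']}, cookies=cookies)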
2. POST requests

import requests
import re

# Step 1: GET https://github.com/login >>>> obtain the token
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
login_res = requests.get('https://github.com/login', headers=headers)

authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
print(authenticity_token)

# Step 2: grab the cookies
cookies = {}
# Put the login page's cookies into the cookies dict
cookies.update(login_res.cookies.get_dict())
print(cookies)

# Step 3: send a POST request to the session endpoint
# Request method: POST
# Request URL: https://github.com/session
# Request body
form_data = {
    "commit": "Sign in",
    "utf8": "?",   # garbled in the source; the live form sends a check mark ("✓")
    "authenticity_token": authenticity_token,
    "login": "pengsima",
    "password": "sa081140510",
    "webauthn-support": "supported"
}
# As JSON instead:
# requests.post('https://github.com/session', headers=headers, json=form_data)
res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)
# print(res.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
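A requests.Session carries cookies across requests automatically, so the manual cookies.update(...) bookkeeping above can be dropped. A minimal sketch of the same login flow (same URLs and form fields; the password is a placeholder):

import re
import requests

session = requests.Session()   # cookies persist across calls automatically
headers = {'User-Agent': 'Mozilla/5.0'}

# Step 1: GET the login page; the session keeps any cookies it sets
login_res = session.get('https://github.com/login', headers=headers)
token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]

# Step 2: POST the form; the stored cookies are sent along automatically
res = session.post('https://github.com/session', headers=headers, data={
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'pengsima',
    'password': 'xxxx',        # placeholder
    'webauthn-support': 'supported',
})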
3. response

import requests

baidu = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
res = requests.get(baidu, headers=headers)
# Response status code
print(res.status_code)
print(res)
# Response headers
print(res.headers)
# Response text
print(res.text)
# Final URL of the request
print(res.url)
# Cookies
print(res.cookies)
print(res.cookies.get_dict())
# Encoding
print(res.encoding)
# res.encoding = 'utf-8'
# print(res.encoding)
# Redirect history
print(res.history)
# Raw bytes
print(res.content)
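If res.text comes out garbled, the declared res.encoding is usually wrong; requests can sniff the charset from the body via apparent_encoding, and assigning it re-decodes res.text:

# Let requests guess the charset from the body, then re-decode the text
res.encoding = res.apparent_encoding
print(res.text[:200])   # should now render correctly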
Downloading an image:

import requests

bo = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0&src=http%3A%2F%2Fwww.xnnews.com.cn%2Fwenyu%2Flxsj%2F201611%2FW020161114828261827516.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# stream=True avoids loading the whole body into memory at once
res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    for line in res.iter_content():
        # f.write(res.content)
        f.write(line)
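By default iter_content() yields the body one byte at a time, which is slow for anything sizable; pass a chunk_size to read in larger blocks:

res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    for chunk in res.iter_content(chunk_size=1024):   # read 1 KB at a time
        f.write(chunk)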
Supplement:

Disable automatic redirect following (the default is True):

allow_redirects=False
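With redirects disabled you see the 3xx response itself and its Location header rather than the final page, for example (http://github.com normally redirects to https):

import requests

res = requests.get('http://github.com/', allow_redirects=False)
print(res.status_code)               # 301
print(res.headers.get('Location'))   # https://github.com/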