Ajax Data Scraping
2021-04-02 11:26
Contents: 1. Scraping Weibo page data via Ajax  2. Scraping Toutiao street-photo (街拍) images via Ajax

Original post: https://www.cnblogs.com/Caiyundo/p/12554341.html

Ajax ("Asynchronous JavaScript and XML") is a web development technique for building fast, interactive, dynamic web applications. By exchanging small amounts of data with the server in the background, Ajax lets a page update asynchronously: parts of the page can be refreshed without reloading the whole page.
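For a crawler, this means an Ajax endpoint usually returns JSON (or an HTML fragment) that can be requested directly, with no need to render the page. A minimal sketch of the idea, using a hypothetical endpoint URL (the real one is found in the browser devtools Network panel, filtered by XHR):

import requests

# Hypothetical endpoint for illustration only; locate the real one in the
# browser's Network panel (filter by XHR) and copy its query parameters.
url = 'https://example.com/api/comments'
headers = {'X-Requested-With': 'XMLHttpRequest'}  # the header Ajax requests carry
params = {'page': 1}

resp = requests.get(url, headers=headers, params=params)
if resp.status_code == 200:
    print(resp.json())  # Ajax endpoints typically respond with JSON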
The following script pages through the comments of one Weibo post by calling the site's Ajax comment endpoint (/aj/v6/comment/big) directly, parses the HTML fragment embedded in each JSON response with pyquery, and stores the results in MongoDB.

import requests
from pyquery import PyQuery as pq
import pymongo


def get_ajax_page(page):
    """Fetch one page of comments from the Weibo Ajax comment endpoint."""
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',  # identifies the request as Ajax
        # Session cookie copied from a logged-in browser; it expires and must be refreshed.
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567'
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': '6',
        'id': '4483557667874538',  # id of the Weibo post whose comments are fetched
        'root_comment_max_id_type': '0',
        'page': page,
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)


def parse_page(js):
    """Yield name/content/datetime dicts from the HTML fragment in the JSON."""
    if not js or not js.get('data'):  # the request may have failed and returned None
        return
    html = js['data'].get('html')
    doc = pq(html)
    for item in doc('div.list_con').items():
        text = item('.WB_text').text()
        name, _, content = text.partition(':')  # split only on the first colon
        yield {
            'name': name,
            'content': content,
            'datetime': item('div.WB_from.S_txt2').text(),
        }


def collection_mongo(host='localhost', port=27017):
    return pymongo.MongoClient(host=host, port=port)


def save_mongo(client, data):
    collection = client.weibo.weibo
    if collection.insert_one(data):  # insert_one replaces the deprecated insert()
        print('Save to mongo')


def search_mongo(client):
    collection = client.weibo.weibo
    return collection.find()


def main():
    client = collection_mongo('10.0.0.100')  # open one connection, reuse it for every page
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for result in parse_page(js):
            save_mongo(client, result)


if __name__ == '__main__':
    # main()
    client = collection_mongo('10.0.0.100')
    for item in search_mongo(client):
        print(item)
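The contents list above also mentions scraping Toutiao street-photo (街拍) images via Ajax, but that part is not included here. A minimal sketch, assuming the classic search endpoint https://www.toutiao.com/api/search/content/ with offset/format/keyword/count parameters and an image_list field in each result; the live interface may have changed and may require extra signed parameters, so verify everything in the devtools Network panel first:

import requests

def get_toutiao_page(offset, keyword='街拍'):
    # Assumed endpoint and parameters; confirm them in the Network panel.
    url = 'https://www.toutiao.com/api/search/content/'
    params = {
        'offset': offset,     # paging step: 0, 20, 40, ...
        'format': 'json',
        'keyword': keyword,
        'count': '20',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    resp = requests.get(url, params=params, headers=headers)
    if resp.status_code == 200:
        return resp.json()

def get_image_urls(js):
    # Each result item may carry an 'image_list' of thumbnail URLs (assumption).
    for item in (js or {}).get('data') or []:
        for image in item.get('image_list') or []:
            yield image.get('url')

for offset in range(0, 60, 20):
    js = get_toutiao_page(offset)
    for url in get_image_urls(js):
        print(url)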