python爬虫实例
2021-05-07 19:29
标签:https sts man beautiful wait previous loop not sync
1. 九寨沟景点
2. python爬虫实例
标签:https sts man beautiful wait previous loop not sync
原文地址:https://www.cnblogs.com/snow-wolf-1/p/13181973.html

import asyncio
import requests
from bs4 import BeautifulSoup
# URL template for the site being crawled; ``{0}`` is replaced with the
# ``href`` value taken from a pager link.
base_url = 'https://www.jiuzhai.com/{0}'
async def parse_next_html(response):
    """Follow the pager's "next" links, starting from ``response``.

    Each page is parsed with BeautifulSoup. When a ``.pager .next a`` element
    is found, its ``href`` is substituted into ``base_url``, the resulting URL
    is printed and fetched via ``get_response``, and the walk continues from
    the fetched page. The walk ends when a page has no such link, the link
    has no ``href``, a fetch returns ``None``, or a URL comes up a second time.

    Args:
        response: Object exposing the page HTML as ``.text``. ``None`` is
            accepted and ends the walk immediately.
    """
    visited = set()
    # Iterate instead of recursing once per page, so a long pager chain
    # cannot exhaust the interpreter's recursion limit.
    while response is not None:
        soup = BeautifulSoup(response.text, features="html.parser")
        next_link = soup.select_one('.pager .next a')
        if next_link is None:
            return
        href = next_link.get('href')
        if not href:
            # An anchor without a target gives us nothing to follow.
            return
        next_url = base_url.format(href)
        if next_url in visited:
            # Guard against pager links that loop back on themselves.
            return
        visited.add(next_url)
        print(next_url, "+++++++")
        response = await get_response(next_url)
async def parse_previous_html(response):
    """Follow the pager's "previous" links, starting from ``response``.

    Each page is parsed with BeautifulSoup. When a ``.pager .previous a``
    element is found, its ``href`` is substituted into ``base_url``, the
    resulting URL is printed and fetched via ``get_response``, and the walk
    continues from the fetched page. The walk ends when a page has no such
    link, the link has no ``href``, a fetch returns ``None``, or a URL comes
    up a second time.

    Args:
        response: Object exposing the page HTML as ``.text``. ``None`` is
            accepted and ends the walk immediately.
    """
    visited = set()
    # Iterate instead of recursing once per page, so a long pager chain
    # cannot exhaust the interpreter's recursion limit.
    while response is not None:
        soup = BeautifulSoup(response.text, features="html.parser")
        previous_link = soup.select_one('.pager .previous a')
        if previous_link is None:
            return
        href = previous_link.get('href')
        if not href:
            # An anchor without a target gives us nothing to follow.
            return
        previous_url = base_url.format(href)
        if previous_url in visited:
            # Guard against pager links that loop back on themselves.
            return
        visited.add(previous_url)
        print(previous_url, "*********")
        response = await get_response(previous_url)
async def get_response(url):
    """Fetch ``url`` with ``requests`` without blocking the event loop.

    ``requests.get`` is a blocking call, so it is handed to the loop's default
    executor and awaited; other coroutines can run while the request is in
    flight.

    Args:
        url: Address to fetch.

    Returns:
        The object returned by ``requests.get``, or ``None`` if the request
        raised (the exception is printed).
    """
    loop = asyncio.get_running_loop()
    try:
        # The timeout keeps one unresponsive server from stalling the crawl
        # indefinitely.
        return await loop.run_in_executor(
            None, lambda: requests.get(url, timeout=10)
        )
    except Exception as e:
        # Deliberately best-effort: a failed page is reported and skipped
        # rather than aborting the whole crawl.
        print(e)
        return None
async def run_manager(url):
    """Fetch ``url`` and walk its pager in both directions.

    Args:
        url: Address of the page to start from. If the fetch returns
            ``None`` nothing else is done.
    """
    response = await get_response(url)
    if response is None:
        return
    # The two walks only read ``response`` and are independent of each other,
    # so they are scheduled together instead of one after the other.
    await asyncio.gather(
        parse_previous_html(response),
        parse_next_html(response),
    )
async def main(
    start_url='https://www.jiuzhai.com/news/number-of-tourists/7110-5000-180',
):
    """Run the crawl from ``start_url``.

    Args:
        start_url: Page to start from; defaults to the article URL this
            script was written around.
    """
    # There is a single entry page, so it is awaited directly rather than
    # being wrapped in a one-element gather().
    await run_manager(start_url)
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. Calling asyncio.get_event_loop() with no
    # running loop is deprecated and fails on recent Python versions.
    asyncio.run(main())