python-spider

2021-02-20 18:17

阅读:483

标签:ant   x64   isp   spl   version   code   range   from   ora   

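# Stateless request: the server gives the client nothing to keep between requests.
# Stateful request: the server hands something back (in effect it assigns the client an id); the browser stores that id, so the second request does not have to obtain it again.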
import requests,lxml,re
from bs4 import BeautifulSoup
# Scrape the used-car listing pages of guazi.com (Shenzhen) and append one CSV
# row per car (name, first "t-i" field, new price, old price) to a local file.

# Number of listing pages to fetch.
page = 3

# Request headers (a plain dict) sent with every page fetch: a browser
# User-Agent plus a previously captured cookie string. They never change
# between pages, so they are built once, outside the loop.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Cookie': "track_id=64470618111905792; uuid=1f7711a6-2666-4118-ccba-21d079d62a19; antipas=A324J8H7723967677PA9H49713; cityDomain=sz; clueSourceCode=%2A%2300; user_city_id=17; ganji_uuid=8532394690421830647367; sessionid=4e3b40e1-4fe0-49e5-b013-0a6ababc8547; lg=1; lng_lat=114.00978_22.53774; gps_type=1; close_finance_popup=2020-04-10; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2264470618111905792%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%221f7711a6-2666-4118-ccba-21d079d62a19%22%2C%22ca_city%22%3A%22sz%22%2C%22sessionid%22%3A%224e3b40e1-4fe0-49e5-b013-0a6ababc8547%22%7D; preTime=%7B%22last%22%3A1586526224%2C%22this%22%3A1586526193%2C%22pre%22%3A1586526193%7D",
}

# NOTE(review): range(page) requests o0, o1, o2 — confirm whether the site's
# page numbering starts at 0 or at 1.
for i in range(page):
    print("正在爬取第{}页".format(i))
    url = 'https://www.guazi.com/sz/buy/o{}/#bread'.format(i)
    # A timeout keeps an unresponsive server from hanging the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    # resp.content is the raw response body as bytes; decode() turns it into
    # text using the default UTF-8 codec.
    html = resp.content.decode()

    # Parse the page and locate the <ul> that holds the car entries.
    soup = BeautifulSoup(html, 'lxml')
    car_list = soup.find('ul', {'class': 'carlist clearfix js-top'})
    if car_list is None:
        # find() returns None when the expected list is absent (for example
        # if the server answered with a different page); skip this page
        # instead of crashing on the attribute access below.
        print("第{}页未找到车辆列表,已跳过".format(i))
        continue
    infos = car_list.find_all('li')

    with open(r'D:\Typora\2020-04-06\guazi.csv', 'a+', encoding='utf-8') as f:
        for info in infos:
            try:
                cars = info.find('h2').get_text()
                # Every space in the title becomes a comma, so the following
                # split(' ') finds nothing to split on and yields a
                # one-element list holding the whole comma-joined title.
                cars_a = re.sub(r' ', ',', cars).split(' ')
                print(cars_a)
                years = info.find('div', {'class': 't-i'}).get_text()
                # Break the "t-i" text on its '|' separators; only the first
                # field is written out (presumably the year — verify against
                # the live page markup).
                year = years.split('|')
                print(year)
                price_box = info.find('div', {'class': 't-price'})
                newprice = price_box.find('p').get_text()
                oldprice = price_box.find('em').get_text()
            except AttributeError:
                # A find() call returned None: this <li> lacks one of the
                # expected elements, so it is skipped.
                continue
            f.write('{},{},{},{}\n'.format(cars_a[0], year[0], newprice, oldprice))

 

python-spider

标签:ant   x64   isp   spl   version   code   range   from   ora   

原文地址:https://www.cnblogs.com/LGGL/p/12681163.html


评论


亲,登录后才可以留言!