win4000-spider

2021-07-04 10:09

阅读：503

import re

import requests
from requests.exceptions import RequestException

# Picture URLs gathered by get_html() and later handed to download().
lis = []

# Headers sent with every requests.get() call in this file: a browser-like
# User-Agent string.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                  ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
def win4000_next_page(url, timeout=10):
    """Fetch a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None`` (a message is printed in that case).
    """
    try:
        resp = requests.get(url, headers=head, timeout=timeout)
    except RequestException:
        # Covers connection errors and timeouts alike.
        print('请求出错')
        return None
    if resp.status_code == 200:
        return resp.text
    print('网站出错')
    return None
def get_html(max_pages=10):
    """Walk an album page by page and collect each picture URL into ``lis``.

    Starts at a fixed album page. For every page visited, the picture URL
    found in the HTML is appended to the module-level ``lis``, then the
    page's "下一张" (next picture) link is followed.

    Args:
        max_pages: Maximum number of pages to visit.
            NOTE(review): the loop bound of the original script was lost
            when the code was published; 10 is a placeholder -- confirm the
            intended value.

    Returns:
        None. Results are accumulated in ``lis``.
    """
    # Capture groups replace the former lstrip()/rstrip() calls: those strip
    # *sets of characters*, not prefixes/suffixes, and only worked by luck.
    pic_pattern = re.compile(r'url="(http://pic1\.win4000\.com/pic[^"]+)"')
    # NOTE(review): the published pattern was pinned to www.soscw.com, which
    # looks like the republishing site rather than the crawled one and could
    # not match pages fetched from win4000.com. Any absolute link directly
    # labelled "下一张" is accepted instead -- confirm against live markup.
    next_pattern = re.compile(r'href="(http://[^"]+)"[^>]*>下一张')

    url = 'http://www.win4000.com/meinv163726.html'
    html = win4000_next_page(url)
    count = 0
    # A failed fetch yields None; stop instead of crashing inside the regex.
    while html is not None and count < max_pages:
        count += 1
        pic_match = pic_pattern.search(html)
        if pic_match is not None:
            lis.append(pic_match.group(1))
        next_match = next_pattern.search(html)
        if next_match is None:
            # No "next picture" link on this page: nothing left to follow.
            break
        html = win4000_next_page(next_match.group(1))
        print('下一张%s' % count)
def download(lis, save_dir=r'C:\pythondm\spider\piclib'):
    """Download every picture URL and save it as ``<index>.jpg``.

    Args:
        lis: Iterable of picture URLs.
        save_dir: Directory the files are written to. The default is the
            absolute path hard-coded in the original script and must be
            adjusted for the local machine; the directory has to exist.

    Returns:
        None.
    """
    import os  # local import: only this function needs it

    # Files are numbered from 1 by position in ``lis``.
    for count, pic_url in enumerate(lis, start=1):
        data = grab_pic(pic_url)
        if data is None:
            # grab_pic() already reported the failure; writing None would
            # raise TypeError and leave an empty file behind.
            continue
        with open(os.path.join(save_dir, '%s.jpg' % count), 'wb') as f:
            f.write(data)
        print('写入完成%s' % count)
def grab_pic(url, timeout=10):
    """Fetch a picture and return its raw bytes.

    Args:
        url: Address of the picture to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as ``bytes`` when the server answers with status
        200, otherwise ``None`` (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=head, timeout=timeout)
    except RequestException:
        # Covers connection errors and timeouts alike.
        print('请求出错')
        return None
    if response.status_code == 200:
        return response.content
    print('网站出错')
    return None
if __name__ == '__main__':
    # Script entry point: collect the picture URLs into ``lis``, download
    # them, then report completion.
    get_html()
    download(lis)
    print('全部完成')

上一篇：Windows Gdi & CDC和HDC的区别与转换

下一篇：9.Spring——基于注解的配置

文章来自：搜素材网的编程语言模块，转载请注明文章出处。
文章标题：win4000-spider
文章链接：http://soscw.com/essay/101692.html

亲，登录后才可以留言！

win4000-spider

评论

热门文章

推荐文章

最新文章

置顶文章