Advanced Python Web Scraping
2021-03-07 23:28
Source: https://www.cnblogs.com/toooof/p/14254086.html
Fetch the names of the first 25 movies on page one of Douban's Top 250: https://movie.douban.com/top250
My answer:
import requests
from bs4 import BeautifulSoup

# Spoof a browser User-Agent so Douban does not reject the request
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
res = requests.get("https://movie.douban.com/top250", headers=head)
soup = BeautifulSoup(res.content, "html.parser")
# Each of the 25 list items holds the title in its first <span>
for i in range(1, 26):
    get = soup.select("#content > div > div.article > ol > li:nth-child(%s) > div > div.info > div.hd > a > span:nth-child(1)" % i)
    print(get[0].string)
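Selecting each <li> individually by index also works as a single query: BeautifulSoup can match the first title <span> of every entry at once. A minimal alternative sketch, assuming the same div.hd > a > span layout the answer above relies on:

import requests
from bs4 import BeautifulSoup

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
res = requests.get("https://movie.douban.com/top250", headers=head)
soup = BeautifulSoup(res.content, "html.parser")
# One selector matches the first <span> inside every .hd anchor,
# so no per-index loop is needed
for span in soup.select("div.hd > a > span:nth-of-type(1)"):
    print(span.string)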
Scrape the names of all 250 movies from https://movie.douban.com/top250.
My answer:
import requests
from bs4 import BeautifulSoup

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}
# The Top 250 spans 10 pages; the start parameter offsets by 25 per page
for j in range(0, 10):
    res = requests.get("https://movie.douban.com/top250?start=%s" % (25 * j), headers=head)
    soup = BeautifulSoup(res.content, "html.parser")
    for i in range(1, 26):
        get = soup.select("#content > div > div.article > ol > li:nth-child(%s) > div > div.info > div.hd > a > span:nth-child(1)" % i)
        print(get[0].string)
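Rather than concatenating the start offset into the URL by hand, requests can build the query string from a params dict. A small sketch of the same request for page two:

import requests

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# requests appends ?start=25 to the URL and handles the encoding
res = requests.get("https://movie.douban.com/top250", params={"start": 25}, headers=head)
print(res.url)  # https://movie.douban.com/top250?start=25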
Download every image on http://www.netbian.com/s/huyan/index.htm
to local files, saving them as 1.jpg, 2.jpg, and so on.
My answer:
import requests
from bs4 import BeautifulSoup

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
res = requests.get("http://www.netbian.com/s/huyan/index.htm", headers=head)
soup = BeautifulSoup(res.text, "html.parser")
# The thumbnails sit in list items 4-19; read the src attribute directly
# instead of splitting str(get) on quote characters
for i in range(4, 20):
    get = soup.select("#main > div.list > ul > li:nth-child(%s) > a > img" % i)
    url = get[0]["src"]
    print(url)
    res1 = requests.get(url, headers=head)
    with open("D:\\Project\\%s.jpg" % i, "wb") as f:
        f.write(res1.content)
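Hard-coding list positions 4 through 19 breaks as soon as the page layout shifts. A hedged sketch that instead selects every thumbnail under the list container and numbers the files with enumerate (it assumes all <img> tags under #main > div.list are wallpaper thumbnails with absolute src URLs):

import requests
from bs4 import BeautifulSoup

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
res = requests.get("http://www.netbian.com/s/huyan/index.htm", headers=head)
soup = BeautifulSoup(res.text, "html.parser")
# Assumption: every <img> under the list container is a thumbnail
for n, img in enumerate(soup.select("#main > div.list img"), start=1):
    data = requests.get(img["src"], headers=head).content
    with open("D:\\Project\\%s.jpg" % n, "wb") as f:
        f.write(data)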
URL: https://www.kugou.com/yy/rank/home/1-8888.html
Scrape the song titles and artist names from the first page.
My answer:
import requests
from lxml import etree

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
res = requests.get("https://www.kugou.com/yy/rank/home/1-8888.html", headers=head)
sele = etree.HTML(res.text)
# The page lists 22 songs; each <li>'s <a> text is "artist - title"
for i in range(1, 23):
    temp = sele.xpath('//*[@id="rankWrap"]/div[2]/ul/li[%s]/a/text()' % i)
    print(temp)
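Dropping the index from the li step makes one XPath query return the link text of every list item, so the per-index loop disappears. A minimal sketch, assuming the same rankWrap structure:

import requests
from lxml import etree

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
res = requests.get("https://www.kugou.com/yy/rank/home/1-8888.html", headers=head)
sele = etree.HTML(res.text)
# li without an index matches every <li> under the ranking <ul>
for text in sele.xpath('//*[@id="rankWrap"]/div[2]/ul/li/a/text()'):
    print(text.strip())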
Use a regular expression to match every occurrence of "zz followed by digits" in the string below, and print the matches.
string="asdasdzz234234adas,asdasdzz2348weqesad,zz657878asd"
My answer:
import re

line = "asdasdzz234234adas,asdasdzz2348weqesad,zz657878asd"
pat = re.compile(r'zz\d+')  # "zz" followed by one or more digits
matches = pat.findall(line)  # findall returns every non-overlapping match
print(matches)
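Running this prints all three matches: ['zz234234', 'zz2348', 'zz657878'].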
Using a user-agent pool, scrape the titles and artists of all 500 songs on the TOP500 chart.
URL: https://www.kugou.com/yy/rank/home/1-8888.html
My answer:
import requests
from lxml import etree
import random

uapools = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
]

def ua():
    # Pick a random User-Agent from the pool
    thisua = random.choice(uapools)
    head = {"User-Agent": thisua}
    return head

# Pages 1-22 carry 22 songs each (484 in total); call ua() per request
# so each page is fetched with a freshly chosen User-Agent
for j in range(1, 23):
    res = requests.get("https://www.kugou.com/yy/rank/home/%s-8888.html" % j, headers=ua())
    sele = etree.HTML(res.text)
    for i in range(1, 23):
        temp = sele.xpath('//*[@id="rankWrap"]/div[2]/ul/li[%s]/a/text()' % i)
        print(temp)

# Page 23 holds the remaining 16 songs; fetch it once instead of 16 times
res1 = requests.get("https://www.kugou.com/yy/rank/home/23-8888.html", headers=ua())
sele = etree.HTML(res1.text)
for n in range(1, 17):
    temp1 = sele.xpath('//*[@id="rankWrap"]/div[2]/ul/li[%s]/a/text()' % n)
    print(temp1)
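Since 22 songs on each of pages 1-22 plus 16 on page 23 gives exactly 500, the two loops can be collapsed into one helper. A refactoring sketch, reusing the imports and the ua() pool function from the answer above:

def scrape_page(page, count):
    # Fetch one chart page with a freshly picked User-Agent and print its entries
    res = requests.get("https://www.kugou.com/yy/rank/home/%s-8888.html" % page, headers=ua())
    sele = etree.HTML(res.text)
    for i in range(1, count + 1):
        print(sele.xpath('//*[@id="rankWrap"]/div[2]/ul/li[%s]/a/text()' % i))

for page in range(1, 24):
    scrape_page(page, 22 if page < 23 else 16)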
The chat runs in the format below; replace each {br} in the bot's reply with a newline.
Me: kitty
小K: 崔燚
Me: hello
小K: {face:14}Hi~
Me: tell me a joke
小K: ★ On international theory{br}A businessman boarded a plane and was delighted to find himself seated next to a beautiful woman. After a brief exchange of pleasantries, he noticed she was reading a handbook of sexual statistics. He asked her about it, and she replied:{br}
Me:
My answer:
import requests
import urllib.request
import json

shuru = input("Me: ")
# Loop until the user types 0 (the original compared the string to the
# integer 0, which is never equal, so the loop could not terminate)
while shuru != "0":
    key = urllib.request.quote(shuru)  # URL-encode the message for the query string
    res = requests.get("http://api.qingyunke.com/api.php?key=free&appid=0&msg=" + key)
    ddd = json.loads(res.text)
    s = ddd["content"]
    s_replace = s.replace('{br}', "\n")  # the API marks line breaks with {br}
    print("小K:", s_replace)
    shuru = input("Me: ")
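The sample transcript shows replies can also embed {face:N} emoticon codes; the re module from the earlier exercise can strip those before printing. A small sketch:

import re

s = "{face:14}Hi~"
# Remove any {face:<digits>} emoticon markers embedded in a reply
clean = re.sub(r'\{face:\d+\}', '', s)
print(clean)  # Hi~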
 