用 Python 获取百度搜索结果链接

2021-02-10 01:19

阅读：813

标签：des %s coding from turn url NPU str 博客

前言

近期有许多项目需要这个功能，由于用 Python 实现起来比较简单，就这么做了。代码贴在下面，觉得好的话就点个赞吧~

代码

# coding: utf-8
import os
import time
import requests
import urllib.parse
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from fake_useragent import UserAgent
from multiprocessing.pool import ThreadPool

# Shared result store filled by get_page: maps a target URL's host (netloc)
# to the redirect URL recorded for it. A later URL on the same host
# overwrites the earlier entry.
LOCATIONS = {}
# Number of worker threads handed to the ThreadPool in baidu_search.
GLOBAL_THREAD = 500
# Timeout setting; presumably seconds per HTTP request — TODO confirm where
# it is meant to be applied.
GLOBAL_TIMEOUT = 50


def get_links(keyword, generator, pages):
    """Build the list of Baidu search URLs to fetch.

    Args:
        keyword: Base search term.
        generator: Inclusive "low-high" range; each number in it is appended
            to ``keyword`` to form one query.
        pages: Inclusive "first-last" range of result pages; page ``p`` is
            requested with ``pn = p * 10``.

    Returns:
        A list of URL strings, ordered page by page and, within a page, by
        ascending generator number.

    Raises:
        ValueError: If a range bound is not an integer.
        IndexError: If a range string contains no "-".
    """
    prefix = "http://www.baidu.com.cn/s?wd="
    offset_key = "&pn="

    def inclusive_range(spec):
        # "a-b" -> range(a, b + 1); only the first two fields are used.
        fields = spec.split("-")
        return range(int(fields[0]), int(fields[1]) + 1)

    return [
        prefix + urllib.parse.quote(keyword + str(suffix)) + offset_key + str(page_no * 10)
        for page_no in inclusive_range(pages)
        for suffix in inclusive_range(generator)
    ]


def get_page(url):
    """Fetch one search result page and record where each result link redirects.

    Every anchor matched by ``div.result > h3.t > a`` is requested without
    following redirects, and the ``Location`` header of that response is
    read. Locations containing "=" are stored in the module-level
    ``LOCATIONS`` dict, keyed by the location's host (netloc).

    This is best effort: a page or link that cannot be fetched within
    ``GLOBAL_TIMEOUT`` seconds, or that fails for any other request error,
    is skipped so that one bad request does not abort the other workers.

    Args:
        url: Search result page URL, as produced by ``get_links``.

    Returns:
        None. Results are written to ``LOCATIONS``.
    """
    headers = {"user-agent": UserAgent().chrome}
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, headers=headers, timeout=GLOBAL_TIMEOUT)
    except requests.RequestException:
        return
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    for link in soup.select("div.result > h3.t > a"):
        href = link.get("href")
        if not href:
            # Anchor without a target: nothing to resolve.
            continue
        try:
            probe = requests.get(
                href, headers=headers, allow_redirects=False, timeout=GLOBAL_TIMEOUT
            )
        except requests.RequestException:
            continue

        # A response that is not a redirect carries no Location header.
        location = probe.headers.get("location")
        if location and "=" in location:
            LOCATIONS[urlparse(location).netloc] = location


def baidu_search(path=r"D:\Desktop\result.txt"):
    """Run the interactive search: prompt, crawl in parallel, save, report.

    Prompts for a keyword, a generator range and a page range, fetches every
    URL from ``get_links`` with ``get_page`` on a thread pool, writes the
    collected URLs with ``save_result`` and prints a short summary.

    Args:
        path: File the collected URLs are written to. Defaults to the
            location the script has always used.

    Raises:
        SystemExit: With status 0 after printing "Input Error!" when input
            ends early or a range cannot be parsed.

    Other failures (network, file system) and Ctrl-C are no longer reported
    as "Input Error!"; they propagate to the caller unchanged.
    """
    # "cls" exists only on Windows; use "clear" everywhere else.
    os.system("cls" if os.name == "nt" else "clear")
    print("-" * 56 + "\n")
    print("| BaiduSearch Engine By 美图博客[https://www.meitubk.com/] |\n")
    print("-" * 56 + "\n")

    try:
        keyword = input("Keyword: ")
        generator = input("Generator(1-10): ")
        pages = input("Pages(0-10): ")
        links = get_links(keyword, generator, pages)
    except (ValueError, IndexError, EOFError):
        # Only genuinely bad or missing input is reported as an input error.
        print("\nInput Error!")
        raise SystemExit(0)

    start = time.time()
    # Do not start more threads than there are pages to fetch; ThreadPool
    # rejects a size of 0, hence the lower bound of 1.
    pool = ThreadPool(processes=max(1, min(GLOBAL_THREAD, len(links))))
    try:
        pool.map(get_page, links)
    finally:
        # Release the worker threads even if a worker raised.
        pool.close()
        pool.join()
    end = time.time()

    save_result(path)
    print("\nSaved in %s" % path)
    print("Result count: %d" % len(LOCATIONS))
    print("Running time: %ds" % (end - start))


def save_result(path):
    """Write every URL stored in ``LOCATIONS`` to *path*, one per line.

    An existing file at *path* is overwritten.

    Args:
        path: Destination file path.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Explicit UTF-8: the locale default encoding differs between platforms
    # and may be unable to encode a URL containing non-ASCII characters.
    with open(path, "w", encoding="utf-8") as file:
        file.writelines(url + "\n" for url in LOCATIONS.values())


# Script entry point: the interactive search starts as soon as this module is
# executed (and also when it is imported, since there is no __main__ guard).
baidu_search()

使用

技术图片

用 Python 获取百度搜索结果链接

标签：des %s coding from turn url NPU str 博客

原文地址：https://www.cnblogs.com/meitubk/p/12746271.html

上一篇：Python基础04-分支及循环

下一篇：Java基础005 --- 安全管理器、可变参数等

文章来自：搜素材网的编程语言模块，转载请注明文章出处。
文章标题：用 Python 获取百度搜索结果链接
文章链接：http://soscw.com/index.php/essay/53349.html

亲，登录后才可以留言！

用 Python 获取百度搜索结果链接

前言

代码

使用

评论

热门文章

推荐文章

最新文章

置顶文章