python+selenium爬取关键字搜索google图片
2020-12-13 05:23
标签:-- roc meta 结果 div arch key open nta
原文地址:https://www.cnblogs.com/buyizhiyou/p/11140128.html

# -*- coding: utf-8 -*-
2
3 import json
4 import os
5 import time
6 from multiprocessing import Pool
7 import multiprocessing
8 import requests
9 from selenium import webdriver
10
11
def get_image_links(keyword, num_requested = 1000):
    """Collect Google Images result URLs for ``keyword`` using Selenium.

    A Chrome window is opened on the image-search page and scrolled down
    repeatedly so that more results are rendered; between scroll rounds the
    "show more results" button is clicked.  The original image URL is then
    read from the ``"ou"`` field of the JSON stored in every element whose
    class contains ``rg_meta``.

    Args:
        keyword: Search term as plain text.  It is URL-encoded before being
            placed in the query string, so spaces and non-ASCII characters
            are safe.
        num_requested: Approximate number of images wanted.  One scroll
            round is performed per 400 requested images, plus one.

    Returns:
        set: The distinct image URLs that were found.
    """
    from urllib.parse import quote_plus

    number_of_scrolls = int(num_requested / 400) + 1
    img_urls = set()  # a set drops duplicate links automatically

    chrome_options = webdriver.ChromeOptions()
    # Optional tweaks, disabled by default:
    # chrome_options.add_argument('--headless')  # run without a visible window
    # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
    # chrome_options.add_argument("lang=en_US")  # force the browser language
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)  # do not load images
    #
    # NOTE(review): ``chrome_options=`` and ``find_element(s)_by_xpath`` are
    # the older Selenium spellings; they are kept so the function matches the
    # Selenium version this script was written for -- confirm before upgrading.
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.maximize_window()
        url = ("https://www.google.com/search?q=" + quote_plus(keyword)
               + "&source=lnms&tbm=isch")
        driver.get(url)
        for _ in range(number_of_scrolls):
            for _ in range(5):
                # multiple scrolls needed to show all 400 images
                driver.execute_script("window.scrollBy(0, 100000)")
                time.sleep(1)
            # Wait for the page to refresh, otherwise the button may not be
            # visible yet.
            time.sleep(5)
            try:
                # The button label depends on the browser UI language, so
                # accept both the Chinese and the English caption.
                driver.find_element_by_xpath(
                    "//input[@value='显示更多结果' or @value='Show more results']"
                ).click()
            except Exception:
                # Any failure to find/click the button is treated as having
                # reached the last page of results.
                print("reach the end of page ")
                break

        # Debugging aid, disabled by default:
        # with open('page.html', 'w') as f:
        #     f.write(driver.page_source)
        imgs = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')  # fuzzy class match
        for img in imgs:
            try:
                img_urls.add(json.loads(img.get_attribute('innerHTML'))["ou"])
            except (ValueError, KeyError, TypeError):
                # One unparsable metadata node should not discard every
                # link gathered so far; skip it.
                continue
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()
    print("finish getting all image urls!")

    return img_urls
51
def download(urls,download_dir):
    """Download every URL in ``urls`` into ``download_dir``.

    The download is best-effort: a URL that cannot be fetched or written is
    reported on stdout and skipped, and the remaining URLs are still tried.

    Args:
        urls: Iterable of image URLs.
        download_dir: Directory that receives the files.  It is created
            (including parents) when it does not exist yet.

    Returns:
        None
    """
    from urllib.parse import urlsplit

    print("start downloading images!")
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)
    for index, url in enumerate(urls):
        try:
            # Name the file after the URL *path* only, so a query string or
            # fragment does not end up in the file name; fall back to a
            # positional name when the path has no final component.
            name = os.path.basename(urlsplit(url).path) or "image_{}".format(index)
            filename = os.path.join(download_dir, name)
            # The whole body is read via ``r.content`` anyway, so the request
            # is not streamed; that lets requests release the connection even
            # when raise_for_status() fails.
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as exc:
            # Per-item boundary: report and move on instead of aborting the
            # whole batch.
            print("failed to download {}: {}".format(url, exc))
            continue
    print("finish downloading images!")
66
# Search terms to scrape; each keyword gets its own sub-directory under
# download_dir, and the resulting paths are collected in download_dirs in the
# same order as keywords.
keywords = ['girl', 'boy']
download_dir = './images/'
download_dirs = []
for keyword in keywords:
    path = os.path.join(download_dir, keyword)
    download_dirs.append(path)
    # exist_ok=True replaces the separate exists() check and avoids the
    # check-then-create race between the two calls.
    os.makedirs(path, exist_ok=True)
75
# Sequential (single-process) alternative:
# for keyword, path in zip(keywords, download_dirs):
#     image_urls = get_image_links(keyword)
#     download(image_urls, path)
79
80
81 ###################################
82 # get image links/MultiProcess
83 ###################################
# Fan the link harvesting out over worker processes: one asynchronous
# get_image_links task is submitted per keyword, and img_urls holds the
# resulting AsyncResult handles in keyword order.
# NOTE(review): the pool is created at module import time; on platforms that
# start workers by re-importing the main module this normally needs an
# ``if __name__ == "__main__":`` guard -- confirm against the rest of the script.
img_urls = []
multiprocessing.freeze_support()
p = Pool(4)  # default number of process is the number of cores of your CPU, change it by yourself
img_urls.extend(
    p.apply_async(get_image_links, (search_term,)) for search_term in keywords
)
89 #img_urls:[
下一篇:地图API哪个好用
文章标题:python+selenium爬取关键字搜索google图片
文章链接:http://soscw.com/essay/30893.html