#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 爬取淘宝 天猫网站数据 (crawl product data from Taobao / Tmall)
# --- Standard library -------------------------------------------------------
import base64
import datetime  # was mis-cased as `dateTime`, which is not an importable module
import json
import socket
import sys

try:
    # Python 2 layout (the interpreter this file was written for).
    import urlparse  # was mis-cased as `URLparse`
    from urllib import quote, unquote
except ImportError:
    # Python 3 moved all three into urllib.parse.
    from urllib import parse as urlparse
    from urllib.parse import quote, unquote

# --- Third-party --------------------------------------------------------------
import scrapy
from scrapy.http import Request
from scrapy.http.headers import Headers
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join

# --- Project-local ------------------------------------------------------------
from taobao.items import TaobaoItem

# Python 2 only: make implicit str<->unicode conversions use UTF-8 so the
# non-ASCII search keyword used by the spider can be mixed into URLs.
# `reload(sys)` is required because site.py deletes `setdefaultencoding`
# at interpreter start-up. Neither name exists on Python 3, where the
# default is already UTF-8, hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')  # was mis-cased as `setDEFAULTencoding`
class MySpider(scrapy.Spider):
    """Crawl Taobao search results for one keyword, rendering pages via Splash.

    Flow:
      1. ``start_requests`` loads the search landing page (``self.url``).
      2. ``parse_result`` reads the page count from the pager and schedules
         one request per result page.
      3. ``parse_next`` extracts product titles and shop names from each page.

    Every request carries a ``meta['splash']`` dict with a Lua script and the
    ``execute`` endpoint; this is presumably consumed by the scrapy-splash
    middleware configured in the project settings (not visible here - confirm).
    """

    name = 'tmall2'
    # Not used for crawling: ``start_requests`` is overridden below.
    start_urls = ["http://example.com", "http://example.com/foo"]

    # Search keyword placed in the ``q=`` parameter of every search URL.
    # It is read in ``__init__`` after the base initializer has copied spider
    # arguments onto the instance, so ``-a search_keyword=...`` overrides it.
    search_keyword = '秋季打底衫'
    # Step of the ``s=`` offset parameter between consecutive result pages.
    page_size = 44

    # Lua script for the landing page: load, wait 1 s, return the HTML.
    _LANDING_LUA = (
        "\n"
        "function main(splash)\n"
        "assert(splash:go(splash.args.url))\n"
        "splash:wait(1.0)\n"
        "return splash:html()\n"
        "end\n"
    )
    # Lua script for result pages: load, wait 8.5 s (asserted), return the HTML.
    _RESULTS_LUA = (
        "\n"
        "function main(splash)\n"
        "assert(splash:go(splash.args.url))\n"
        "assert(splash:wait(8.5))\n"
        "return splash:html()\n"
        "end\n"
    )

    def __init__(self, *args, **kwargs):
        """Set up the header/cookie dictionaries and the landing-page URL.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        the spider can be built by Scrapy with spider arguments; calling it
        with no arguments still works as before.
        """
        super(MySpider, self).__init__(*args, **kwargs)

        # NOTE(review): ``headers`` and ``cookies`` are not passed to any
        # request built in this class. They also look like a captured browser
        # session for a real account; such values should not live in source
        # control - rotate them and load from settings/environment instead.
        # NOTE(review): several values look corrupted by copy/paste (for
        # example 'application/xhtml xml', 'GECko', '<ime=0', and a whole
        # Set-Cookie line stored as the 'pnm_cku822' value). They are kept
        # byte-for-byte here; fix them before actually sending them.
        self.headers = {
            'Host': 'detail.tmall.com',
            'user-Agent': 'Mozilla/5.0 (windows NT 10.0; WOW64; rv:44.0) GECko/20100101 Firefox/44.0',
            'Accept': 'text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://list.tmall.com/search_product.htm?q=iphone',
            'Cookie': 'hng=CN%7Czh-cn%7CCNY; l=AmFhUQz9l9Bm0s1PIcUbVzUrUSd709Vr; pnm_cku822=213UW5TcyMNYQwiAiwTR3tCf0J%2FQnhEcUpkMmQ%3D%7CUm5OcktzSHFMdkpwTXFEcSc%3D%7CU2xMHDJ%2BH2QJZwBxX39RaFF%2FX3EtTCpGIV8lC10L%7CVGhXd1llXGRfZlthXWdaZlNmUWxOdEpxRXhMeUx0QHhMckh2Qmw6%7CVWldfS0SMg4zBycbJAQqAXRfeB9kNFY0EDtEajxq%7CVmhIGCwSMg8vEycaJAQ6DzQIKBQgHyICPgM2CysXIxwhAT0AOQRSBA%3D%3D%7CV25Tbk5zU2xMcEl1VWtTaUlwJg%3D%3D; cna=c7xUD5TeoxgCARsmEAVdwH4E; cq=ccp%3D1; t=ea7cda7b4dd7d94c574c51a61cd68bf6; uc3=nk2=G4mgLCRZx6no8qfi5g%3D%3D&id2=UonZBtTqYSCQGg%3D%3D&vt3=F8dAscn1mkMKfq3pmos%3D&lg2=W5iHLLyFOGW7aA%3D%3D; lgc=xiaowenjie886; tracknick=xiaowenjie886; _tb_token_=WcXcAjsXNiib; cookie2=3647140634e8134de4621d27d06a6239; OZ_1U_2061=vid=v6cf00b635ac22.0&ctime=1456406710<ime=0; OZ_1Y_2061=erefer=https%3A//list.tmall.com/search_product.htm%3Fq%3D%25CD%25E2%25CC%25D7%25C4%25D0%26click_id%3D%25CD%25E2%25CC%25D7%25C4%25D0%26from%3Dmallfp..pc_1.0_hq%26spm%3D875.7789098.a1z5h.1.1DJapJ&eurl=https%3A//detail.tmall.com/item.htm%3Fspm%3Da220m.1000858.1000725.11.XG2djx%26id%3D525068649325%26skuId%3D3125134725161%26areaId%3D440300%26cat_id%3D50025174%26rn%3D020410dd2019f68eaf3d848b4d14552f%26user_id%3D196993935%26is_b%3D1&etime=1456406710&ctime=1456406710<ime=0&compid=2061',
            'Connection': 'keep-alive',
            'cache-Control': 'max-age=0'
        }
        self.cookies = {
            'l': 'ArGxZLdew/Qq2hKqnZPLZoKK4TdLHyUb',
            'cna': 'OW9VD5ReU2Acadxw7hJSgV4y',
            'cookie2': '1cfecc6ae5749b36804d524b9d0cccb4',
            't': '2fd2137e54b753c57bec7b945f504547',
            '_tb_token_': 'l0ckiPAV9KXX',
            'ck1': '',
            'uc1': 'cookie14=UoWyiPlLPWymJA%3D%3D&existShop=false&cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=WqG3DMC9EdFmJgke4t0pDw%3D%3D&tag=3&cookie15=VT5L2FSpMGV7TQ%3D%3D&pas=0',
            'uc3': 'nk2=G4mgLCRZx6no8qfi5g%3D%3D&id2=UonZBtTqYSCQGg%3D%3D&vt3=F8dAScn1nphE%2FG5b7yQ%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D',
            'lgc': 'xiaowenjie886',
            'tracknick': 'xiaowenjie886',
            'cookie1': 'UNaG7hUVmBqzT5U4J5xH8HeBiBsUUL0QGHEE%2BJc503Q%3D',
            'unb': '1821174258',
            'skt': '116663449cdcca0c',
            '_nk_': 'xiaowenjie886',
            '_l_g_': 'Ug%3D%3D',
            'cookie17': 'UonZBtTqYSCQGg%3D%3D',
            'hng': 'CN%7Czh-cn%7CCNY',
            'login': 'true',
            'pnm_cku822': 'pnm_cku822=213UW5TcyMNYQwiAiwTR3tCf0J%2FQnhEcUpkMmQ%3D%7CUm5OcktzSHFMdkpwTXFEcSc%3D%7CU2xMHDJ%2BH2QJZwBxX39RaFF%2FX3EtTCpGIV8lC10L%7CVGhXd1llXGRfZlthXWdaZlNmUWxOdEpxRXhMeUx0QHhMckh2Qmw6%7CVWldfS0SMg4zBycbJAQqAXRfeB9kNFY0EDtEajxq%7CVmhIGCwSMg8vEycaJAQ6DzQIKBQgHyICPgM2CysXIxwhAT0AOQRSBA%3D%3D%7CV25Tbk5zU2xMcEl1VWtTaUlwJg%3D%3D; expires=Sat, 26 Mar 2016 13:32:50 GMT; path=/; domain=detail.tmall.com'
        }
        # Landing page of the search; only used to discover the page count.
        self.url = ('https://s.taobao.com/search?spm=a21bo.7724922.8452-fline.1.uFDF4G&q=%s'
                    % self.search_keyword)

    def _splash_request(self, url, callback, lua_source):
        """Build a request whose ``meta['splash']`` runs *lua_source* on *url*."""
        # The keyword must be lowercase ``meta``: ``scrapy.Request`` rejects
        # an unknown ``Meta`` keyword with a TypeError.
        return scrapy.Request(url, callback, meta={
            'splash': {
                'args': {'lua_source': lua_source, 'url': url},
                'endpoint': 'execute',
            }
        })

    def start_requests(self):
        """Yield the single request for the search landing page."""
        yield self._splash_request(self.url, self.parse_result, self._LANDING_LUA)

    def parse_result(self, response):
        """Read the page count from the pager and schedule every result page.

        The pager text nodes are joined with ',' and the first character is
        dropped before integer conversion (presumably a leading separator or
        prefix character - confirm against the live markup).

        If the pager is missing or not numeric, a warning is logged and only
        the first result page is scheduled, instead of failing with ValueError.
        """
        pager_text = ','.join(
            response.xpath("//div[@class='pager']/ul/li[2]/text()").extract())
        try:
            page_count = int(pager_text[1:])
        except ValueError:
            self.logger.warning(
                "Could not read page count from pager text %r on %s; "
                "falling back to a single page", pager_text, response.url)
            page_count = 1

        # The ``s`` parameter is an item offset that advances one page at a time.
        for offset in range(0, self.page_size * page_count, self.page_size):
            page_url = 'https://s.taobao.com/search?q=%s&s=%d' % (
                self.search_keyword, offset)
            yield self._splash_request(page_url, self.parse_next, self._RESULTS_LUA)

    def parse_next(self, response):
        """Return one ``TaobaoItem`` per result page.

        ``title`` and ``shopname`` are each the full list of strings matched
        on the page, not one value per product.
        """
        item = TaobaoItem()
        # The trailing space in 'item ' is kept: @class is compared exactly.
        item['title'] = response.xpath(
            "//div[@class='item ']/div[2]/div[2]/a/text()").extract()
        item["shopname"] = response.xpath(
            "//a[@class='shopname J_MouseEneterLeave J_ShopInfo']/span[2]/text()").extract()
        return item

# Ops note kept from the original source: `sudo service docker restart`
# (presumably for a Docker-hosted Splash instance - confirm).
# 来源:搜素材网素材
# 搜素材网所有素材均为本站用户上传,仅供学习与参考,请勿用于商业用途,如有侵犯您的版权请联系客服服务QQ
# 本站提供各类html5响应式模板,前端js素材,网站模板,后台模板素材,程序源码素材。
# 由于技术有限,本站不提供安装服务与bug修复,各类源码只提供分享服务,感谢您的理解。
# 如果对本站有任何意见,请点击右侧侧边栏的反馈意见,我们会及时处理。