Batch-fetching job postings from 51job, Lagou, and Zhilian Zhaopin

# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: default to UTF-8 for implicit str/unicode conversions
import os
import threading
import datetime
from bs4 import BeautifulSoup
import requests
import re
import pymysql
from download import download
import csv
class Job_qcwy(threading.Thread):
    def __init__(self, jobarea):
        threading.Thread.__init__(self)
        # self.url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=' + jobarea + '&keyword=' + keyword + '&keywordtype=2&curr_page=' + str(pn) + '&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9'
        self.city = jobarea
        self.keywords = ['大数据']
        self.headers = {
            'Host': 'search.51job.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.filepath = '/Users/edz/documents/project/PositionAnalysis/positions.csv'
        # self.data = {
        #     'fromJs': 1,
        #     'jobarea': jobarea,
        #     'funtype': '',
        #     'industrytype': '',
        #     'keyword': keyword,
        #     'keywordtype': 2,
        #     'lang': 'c',
        #     'stype': 2,
        #     'postchannel': 0000,
        #     'fromType': 1,
        #     'confirmdate': 9,
        #     'curr_page': pn
        # }
        self.time = datetime.datetime.now()
        # Zero-pad month and day so the date matches 51job's "MM-DD" publish-date column
        if self.time.month < 10:
            self.month = "0" + str(self.time.month)
        else:
            self.month = str(self.time.month)
        if self.time.day < 10:
            self.day = "0" + str(self.time.day)
        else:
            self.day = str(self.time.day)
        self.today = self.month + "-" + self.day
        self.session = requests.Session()
    def run(self):
        self.conn()
        f = open(self.filepath, 'a')
        w = csv.writer(f)
        try:
            with self.db.cursor() as cur:
                for i in range(len(self.keywords)):
                    for j in range(1, 2):
                        self.url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=' + self.city + '&keyword=' + self.keywords[i] + \
                                   '&keywordtype=2&curr_page=' + str(j) + '&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9'
                        html = download(self.url, self.headers)
                        # html = self.session.get(self.url, headers=self.headers)
                        html.encoding = 'GBK'
                        bsobj = BeautifulSoup(html.text, 'html.parser')
                        table = bsobj.find('div', {'class': 'dw_table'})
                        all_els = table.findAll('div', {'class': 'el'})
                        if all_els:
                            print len(all_els)
                            for k in range(1, len(all_els)):
                                today = all_els[k].find('span', {'class': 't5'}).get_text()
                                if today == self.today:  # only keep positions published today
                                    link = all_els[k].find('a')['href']
                                    print link
                                    positionId = re.findall(r'\d+', link)[1]
                                    s = "SELECT `Position` FROM `51job_position_info` WHERE `PositionId`='" + positionId + "'"
                                    if cur.execute(s) == 0:
                                        position, salary, area, Description_and_requirements, companyName, companyNature, companyPersonnel, companyIntroduction = self.get_info(link)
                                        publish_time = self.today
                                        print position, salary, area, companyName
                                        # w.writerow([positionId, position, salary, Description_and_requirements, area, publish_time, companyName, companyNature, companyPersonnel, companyIntroduction])
                                        try:
                                            sql = "INSERT INTO `51job_position_info` (`PositionId`, `Position`, `Salary`, `Description`, `Address`, `Publish_time`, `Company`, `CompanyNature`, `CompanyPersonnel`, `CompanyIntroduction`) " \
                                                  "VALUES ('" + positionId + "','" + position + "','" + salary + "','" + Description_and_requirements + "','" + area + "','" + publish_time + "','" + companyName + "','" + companyNature + "','" + companyPersonnel + "','" + companyIntroduction + "')"
                                            cur.execute(sql)
                                            self.db.commit()
                                            print sql
                                        except Exception as e:
                                            print e
                                    else:
                                        print positionId + " already exists"
                        else:
                            print all_els
        finally:
            f.close()
            self.db.close()
    # Database connection settings
    def conn(self):
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            db='PositionAnalysis',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
    # Fetch the detail page of a single position and extract its fields
    def get_info(self, link):
        try:
            self.headers['Host'] = 'jobs.51job.com'
            html = download(link, self.headers)
            # html = self.session.get(link, headers=self.headers)
            html.encoding = 'gbk'
            bs = BeautifulSoup(html.text, 'html.parser')
            thjob = bs.find('div', {'class': 'tHeader tHjob'})
            cn = thjob.find('div', {'class': 'cn'})
            # Position title
            JobName = cn.find('h1').get_text()
            lname = cn.find('span').get_text()
            if lname is None:
                lname = ''
            # Salary
            salary = cn.find('strong').get_text()
            if salary is None:
                salary = ''
            infos = bs.find('div', {'class': 'tCompany_main'}).find('div', {'class': 'bmsg job_msg inBox'}).get_text().replace('\t', '').replace('\r', '')
            # sps = bs.findAll('span', {'class': 'sp4'})
            # # Experience requirement
            # experience = sps[0].get_text().replace('\n', '')
            # # Education requirement
            # education = sps[1].get_text().replace('\n', '')
            if infos != '':
                if re.findall(r'(.*?)\n', infos)[0] == '':
                    # Job description and requirements
                    try:
                        Description_and_requirements = re.findall(r'(.*?)\n', infos)[1] + re.findall(r'(.*?)\n', infos)[2]
                    except:
                        Description_and_requirements = ''
                else:
                    try:
                        Description_and_requirements = re.findall(r'(.*?)\n', infos)[0] + re.findall(r'(.*?)\n', infos)[1]
                    except:
                        Description_and_requirements = ''
            else:
                Description_and_requirements = ''
            a = bs.find('div', {'class': 'bmsg inbox'})
            if a is not None:
                ar = a.find('p', {'class': 'fp'})
                # Work location
                area = lname + '-' + re.findall(r'span>(.*?)<', str(ar))[0].replace('\t', '').decode('utf-8')
            else:
                area = lname
            # Company name
            companyName = cn.find('p', {'class': 'cname'}).find('a').get_text()
            if companyName is None:
                companyName = ''
            company = cn.find('p', {'class': 'msg ltype'}).get_text().replace('\r', '').replace('\t', '').replace(' ', '')
            try:
                company = company.decode('utf-8').encode('utf-8')
            except:
                pass
            if company:
                if re.findall(r'(.*?)\xc2\xa0', company) != []:
                    q = re.findall(r'(.*?)\xc2\xa0', company)[0]
                    if re.findall(r'\d', q) == []:
                        # Company type
                        companyNature = q
                        if len(re.findall(r'(.*?)\xc2\xa0', company)) > 2:
                            # Company size
                            try:
                                companyPersonnel = re.findall(r'(.*?)\xc2\xa0', company)[4]
                            except:
                                companyPersonnel = ''
                        else:
                            companyPersonnel = ''
                    else:
                        companyNature = ''
                        companyPersonnel = q
                else:
                    q = re.findall(r'(.*?)\xa0\xa0', company)[0]
                    if re.findall(r'\d', q) == []:
                        companyNature = q
                        if len(re.findall(r'(.*?)\xa0\xa0', company)) > 2:
                            try:
                                companyPersonnel = re.findall(r'(.*?)\xa0\xa0', company)[2]
                            except:
                                companyPersonnel = ''
                        else:
                            companyPersonnel = ''
                    else:
                        companyNature = ''
                        companyPersonnel = q
            else:
                companyNature = ''
                companyPersonnel = ''
            # Company introduction
            try:
                companyIntroduction = bs.find('div', {'class': 'tmsg inbox'}).get_text().replace('\t', '').replace('\n', '')
            except:
                companyIntroduction = ''
            return JobName, salary, area, Description_and_requirements, companyName, companyNature, companyPersonnel, companyIntroduction
        except Exception as e:
            print link
            print e
if __name__ == '__main__':
    # filepath = '/Users/edz/Documents/project/PositionAnalysis/positions.csv'
    # if not os.path.isfile(filepath):
    #     f = open(filepath, 'a')
    #     writer = csv.writer(f)
    #     writer.writerow(['positionId', 'position', 'salary', 'Description_and_requirements', 'area', 'publish_time', 'companyName', 'companyNature', 'companyPersonnel', 'companyIntroduction'])
    #     f.close()
    jobarea = ['010000']  # 51job area codes; '010000' is Beijing
    threads = []
    for i in range(len(jobarea)):
        t = Job_qcwy(jobarea[i])
        threads.append(t)
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
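The script imports a download() helper from a local download module that is not included in the listing. Judging only from how it is called here (download(url, headers) returning an object with .encoding and .text), it is presumably a thin wrapper around requests.get; a minimal sketch under that assumption is shown below. The retry count, timeout, and backoff values are illustrative choices, not part of the original module.

# download.py -- hypothetical sketch of the helper imported above.
# Assumption: download(url, headers) returns a requests response object
# exposing .encoding and .text, as the callers in run() and get_info() expect.
# -*- coding:utf-8 -*-
import time
import requests

def download(url, headers, retries=3, timeout=10):
    # Fetch url with the given headers, retrying a few times on network errors.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print e
            time.sleep(2 ** attempt)  # simple exponential backoff between attempts
    return None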
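One further note on the INSERT in run(): the SQL string is built by concatenation, so any field that contains a single quote will break the statement, and the pattern is open to SQL injection. A sketch of the same INSERT, keeping the original table and column names but letting pymysql bind the values, could replace that block:

# Sketch only: same table and columns as in run(), but the values are passed to
# cur.execute() as parameters instead of being concatenated into the SQL text.
sql = ("INSERT INTO `51job_position_info` "
       "(`PositionId`, `Position`, `Salary`, `Description`, `Address`, `Publish_time`, "
       "`Company`, `CompanyNature`, `CompanyPersonnel`, `CompanyIntroduction`) "
       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
cur.execute(sql, (positionId, position, salary, Description_and_requirements, area,
                  publish_time, companyName, companyNature, companyPersonnel,
                  companyIntroduction))
self.db.commit()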