当前位置: 首页>大数据>正文

Python爬取51Job招聘信息

1、 准备工作

分析51job招聘信息页面,如图所示,红框标记的是页码。将URL以页码为界分割为前后两个部分,爬取多个页面时只需要修改页码信息。

Python爬取51Job招聘信息,第1张

分析请求响应信息:请求时需要模拟浏览器的请求信息,如下图,包括header信息和cookie信息;响应结果位于页面的<script>标签中(赋值给 window.__SEARCH_RESULT__ 的JSON字符串),需要通过正则表达式提取后再解析。

Python爬取51Job招聘信息,第2张
Python爬取51Job招聘信息,第3张

2、 用到的Python库

requests:模拟浏览器请求

re:正则表达式

json:将JSON字符串解析为Python对象

pandas:整理表格数据并写入CSV文件

3、代码

import json
import re
import time

import pandas as pd
import requests

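'''
Parameters of get_data:

pre_url:  URL prefix, i.e. the part placed before the page number
suf_url:  URL suffix, i.e. the part placed after the page number
headers:  HTTP request headers
cookies:  cookies sent with each request
page_num: exclusive upper bound of the page range; pages 1 .. page_num - 1 are crawled
'''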

def get_data(pre_url, suf_url, headers, cookies, page_num):
    """Crawl 51job search-result pages and append the postings to a CSV file.

    For every page number ``i`` in ``range(1, page_num)`` the URL
    ``pre_url + str(i) + suf_url`` is fetched, the JSON assigned to
    ``window.__SEARCH_RESULT__`` in the page is parsed, and one row per entry
    of its ``engine_jds`` list is appended (no header row, no index column)
    to ``51Job乌鲁木齐招聘信息.csv``.  The function sleeps one second after
    each page.

    Args:
        pre_url: URL part placed before the page number.
        suf_url: URL part placed after the page number.
        headers: Request headers passed to ``requests.get``.
        cookies: Cookies passed to ``requests.get``.
        page_num: Exclusive upper bound of the page range; pages
            ``1 .. page_num - 1`` are fetched, so values <= 1 fetch nothing.

    Returns:
        None.
    """
    for i in range(1, page_num):
        print("爬取第" + str(i) + "页数据")
        url = pre_url + str(i) + suf_url
        # A timeout keeps one stalled connection from hanging the whole crawl.
        web = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        web.encoding = 'gbk'

        job_list = _extract_job_list(web.text)
        if job_list is None:
            # The page carried no search-result JSON; skip it instead of
            # crashing in json.loads.
            print("跳转网页,无数据")
        else:
            data = _jobs_to_frame(job_list)
            print(data)
            try:
                data.to_csv("51Job乌鲁木齐招聘信息.csv", mode="a",
                            header=False, index=False, encoding="utf-8")
            except OSError as err:
                print("写入CSV失败: " + str(err))

        # Pause between requests.
        time.sleep(1)


def _extract_job_list(html):
    """Return the ``engine_jds`` list embedded in *html*, or None if absent."""
    match = re.search(r'window\.__SEARCH_RESULT__ = (.*?)</script>', html, re.S)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))['engine_jds']
    except (json.JSONDecodeError, KeyError, TypeError):
        return None


def _jobs_to_frame(job_list):
    """Build a DataFrame with one row per job entry in *job_list*."""
    columns = ["工作名称", "工资待遇", "工作地点", "职位要求", "公司名称",
               "公司规模", "公司类别", "公司福利", "主营业务", "发布日期"]
    rows = [
        {
            "工作名称": obj['job_name'],
            "工资待遇": obj['providesalary_text'],
            "工作地点": obj['workarea_text'],
            # The first element of attribute_text is dropped, as in the
            # original script; the rest is joined with spaces.
            "职位要求": ' '.join(obj['attribute_text'][1:]),
            "公司名称": obj['company_name'],
            "公司规模": obj['companysize_text'],
            "公司类别": obj['companytype_text'],
            "公司福利": obj['jobwelf'],
            "主营业务": obj['companyind_text'],
            "发布日期": obj['issuedate'],
        }
        for obj in job_list
    ]
    return pd.DataFrame(rows, columns=columns)

# Press the green button in the gutter to run the script.

if __name__ == '__main__':
    # The search URL is split around the page number; get_data inserts the
    # page number between the two halves.
    pre_url = "https://search.51job.com/list/310200,000000,0000,00,9,99,+,2,"
    # "&degreefrom" had been corrupted to "°reefrom" (the "&deg" prefix was
    # decoded as an HTML entity); restored here.
    suf_url = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

    # Browser-like request headers.
    # NOTE(review): 'br' in Accept-Encoding allows a Brotli-compressed reply;
    # confirm a Brotli decoder is installed, otherwise the body may not decode.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'Referer': 'https://search.51job.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
    }

    # Cookie header value copied from a browser session.
    # NOTE(review): these look like session-specific values; presumably they
    # must be replaced with ones from your own browser session.
    raw_cookie = "_uab_collina=164515157588672382024854; guid=ffafb018452895c75b5ff63cd2fb9563; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; slife=lastvisit%3D310200%26%7C%26; privacy=1646033921; search=jobarea%7E%60310200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA-java%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; acw_tc=76b20fe516460406333445179e1863f701fac22d0d83677b10737d6e54a7ab; acw_sc__v2=621c9639e08991e176e0cadd6a8c5f5ea4dabb36; ssxmod_itna=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnmx0v+xPGzDAxn40iDtrO59hfuDq1YGAPrF3YpSa2tX74aRDb6G4W2D3DU4i8DCL2F4WDemtD5xGoDPxDeDA7KiTDY4DdXxYPG0DiKDpx0kG25D7ZF41lKDTPYDRgaGDQyk9gPmx407DiHq920kD75pDlpxIRYD018f1Av1GRG=qlDDUmR60n2bMbb5xqi36m9Gq40OD0FGXxibG6g6Rav14w+e6QxPDaDPKlbq3iDowDrP=QIxmni5bGiQtYxTmlGx=WKKYmrKDDp4Z4PWGD4D==; ssxmod_itna2=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnDnKd2qDsKoDLGYhnav7bAi3Fw8MYYm7=w3ifeEjpMIeq8=EdxnRI=ayekyb=8lFkEnXmpQKjjLW=z/=ORRGLUKMRowcU3gBbUOXpQBUZKAKDq33ZUSi4OMi47CE5IjIhQf2M7l03ulie7eiK7CYP3b0k3lfr2CXFoLwZYQpaNWhk57XKYh9hK0G5i7j187j9tQ61U+eZtl3o=3n9lCI88kow8U=4a+0KyOBx0fGD5BOG1KZfNVSxC69Kwz3DkQM79FxD=3zinLzb=QZ16Kjmhx0YPw3ONv1=5Avpa6phdYz6iPvLB3=1pU1Q56d5FwG05=pEhWbiR5TKRQxYW23FUvPzNAPb9Q=8/=0YLjBUa60ezpEErjiTop2DhW==Qtb0bgnQEuv+nFz6r2uS=e9CbT2TUzEWIAHguA82m/=4=RTKnSc2I9uR4uffo9WXZoTv+dFDG2Sr3SG4IRt4FA0htBx4Y2sqQEYl=R2HbxDFqD+oLNQG5D4PEDqBpKnhn0G5AmKAdCEjryxxD==="

    # requests expects a mapping of cookie name -> value. Passing the whole
    # header under a single "Cookie" key would send one cookie named "Cookie",
    # so the header string is split into its individual cookies instead.
    # split("=", 1) keeps any "=" that appears inside a cookie value.
    cookies = dict(pair.split("=", 1) for pair in raw_cookie.split("; "))

    # get_data iterates range(1, page_num), so 51 crawls pages 1 through 50.
    get_data(pre_url, suf_url, headers, cookies, 51)

4、爬取结果

Python爬取51Job招聘信息,第4张

https://www.xamrdz.com/bigdata/7wj1994606.html

相关文章: