1、 准备工作
分析51job招聘信息页面,如图所示,红框标记的是页码。将URL以页码为界分割为前缀和后缀两个部分,爬取多个页面时只需要修改页码信息。
分析请求响应信息:请求时需要模拟浏览器的请求信息(如下图),包括header信息和cookie信息;响应结果位于页面的<script>标签(JavaScript代码)中,需要通过正则表达式提取后再解析。
2、 用到的Python库
requests:模拟浏览器请求
re:正则表达式
json:字符串转json
pandas:excel操作
3、代码
import json
import re
import time

import pandas as pd
import requests
# The search page embeds its results as a JSON object assigned to
# window.__SEARCH_RESULT__ inside a <script> tag; capture that JSON text.
_SEARCH_RESULT_RE = re.compile(
    r'window\.__SEARCH_RESULT__ = (.*?)</script>', re.S)

# Seconds to wait for the server before giving up on a page.
_REQUEST_TIMEOUT = 10

# (CSV column title, key in one job record) in output column order.
_JOB_FIELDS = (
    ("工作名称", 'job_name'),
    ("工资待遇", 'providesalary_text'),
    ("工作地点", 'workarea_text'),
    ("职位要求", 'attribute_text'),
    ("公司名称", 'company_name'),
    ("公司规模", 'companysize_text'),
    ("公司类别", 'companytype_text'),
    ("公司福利", 'jobwelf'),
    ("主营业务", 'companyind_text'),
    ("发布日期", 'issuedate'),
)


def _extract_jobs(html):
    """Return the list of job records embedded in *html*, or [] if none."""
    match = _SEARCH_RESULT_RE.search(html)
    if match is None:
        return []
    try:
        info_dict = json.loads(match.group(1))
    except ValueError:
        # json.JSONDecodeError is a ValueError: the payload was not JSON.
        return []
    if not isinstance(info_dict, dict):
        return []
    return info_dict.get('engine_jds') or []


def _jobs_to_frame(job_list):
    """Build the output DataFrame (one row per job) from job records."""
    columns = {}
    for title, key in _JOB_FIELDS:
        values = [job[key] for job in job_list]
        if key == 'attribute_text':
            # attribute_text is a list of strings; the first entry is
            # skipped (presumably it repeats the work area - confirm) and
            # the rest are joined into one space-separated cell.
            values = [' '.join(parts[1:]) for parts in values]
        columns[title] = values
    return pd.DataFrame(columns, columns=[title for title, _ in _JOB_FIELDS])


def _crawl_page(url, headers, cookies):
    """Fetch one result page and append its jobs to the CSV file."""
    try:
        web = requests.get(url, headers=headers, cookies=cookies,
                           timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("请求失败: " + str(err))
        return
    web.encoding = 'gbk'
    # Debug aid: dump the raw page so a failed extraction can be inspected.
    print(web.text)

    job_list = _extract_jobs(web.text)
    if not job_list:
        # Typically a redirect / verification page without the JSON payload.
        print("跳转网页,无数据")
        return

    data = _jobs_to_frame(job_list)
    print(data)
    try:
        # Append without header or index so pages concatenate into one table.
        data.to_csv("51Job乌鲁木齐招聘信息.csv", mode="a+", header=False,
                    index=False, encoding="utf-8")
    except OSError as err:
        print("写入CSV失败: " + str(err))


def get_data(pre_url, suf_url, headers, cookies, page_num):
    """Crawl 51job search result pages and append the jobs to a CSV file.

    The URL of page ``i`` is ``pre_url + str(i) + suf_url``.

    Args:
        pre_url: URL text before the page number.
        suf_url: URL text after the page number.
        headers: Request headers passed to ``requests.get``.
        cookies: Cookies passed to ``requests.get``.
        page_num: Exclusive upper bound of the page range; pages
            ``1 .. page_num - 1`` are crawled (``page_num <= 1`` crawls
            nothing).

    Returns:
        None. Rows are appended to "51Job乌鲁木齐招聘信息.csv" in the
        current working directory; progress is printed to stdout.

    A page that cannot be fetched, or that carries no job data, is reported
    and skipped so that one bad page does not abort the whole crawl.
    """
    for i in range(1, page_num):
        print("爬取第" + str(i) + "页数据")
        url = pre_url + str(i) + suf_url
        _crawl_page(url, headers, cookies)
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # The search URL is split around the page number:
    # full URL = pre_url + <page number> + suf_url.
    pre_url = "https://search.51job.com/list/310200,000000,0000,00,9,99,+,2,"
    # "&degreefrom" had been corrupted to "°reefrom" ("&deg" decoded as an
    # HTML entity); restored so the query parameter is sent correctly.
    suf_url = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    # Browser-like request headers.
    # NOTE(review): 'br' is advertised in Accept-Encoding; confirm the
    # environment can decode Brotli responses, otherwise drop it.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'Referer': 'https://search.51job.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
    }
    # Cookie header value copied from a browser session. These values are
    # session-specific; replace them with a fresh capture before running.
    raw_cookie = "_uab_collina=164515157588672382024854; guid=ffafb018452895c75b5ff63cd2fb9563; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; slife=lastvisit%3D310200%26%7C%26; privacy=1646033921; search=jobarea%7E%60310200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA-java%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; acw_tc=76b20fe516460406333445179e1863f701fac22d0d83677b10737d6e54a7ab; acw_sc__v2=621c9639e08991e176e0cadd6a8c5f5ea4dabb36; ssxmod_itna=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnmx0v+xPGzDAxn40iDtrO59hfuDq1YGAPrF3YpSa2tX74aRDb6G4W2D3DU4i8DCL2F4WDemtD5xGoDPxDeDA7KiTDY4DdXxYPG0DiKDpx0kG25D7ZF41lKDTPYDRgaGDQyk9gPmx407DiHq920kD75pDlpxIRYD018f1Av1GRG=qlDDUmR60n2bMbb5xqi36m9Gq40OD0FGXxibG6g6Rav14w+e6QxPDaDPKlbq3iDowDrP=QIxmni5bGiQtYxTmlGx=WKKYmrKDDp4Z4PWGD4D==; ssxmod_itna2=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnDnKd2qDsKoDLGYhnav7bAi3Fw8MYYm7=w3ifeEjpMIeq8=EdxnRI=ayekyb=8lFkEnXmpQKjjLW=z/=ORRGLUKMRowcU3gBbUOXpQBUZKAKDq33ZUSi4OMi47CE5IjIhQf2M7l03ulie7eiK7CYP3b0k3lfr2CXFoLwZYQpaNWhk57XKYh9hK0G5i7j187j9tQ61U+eZtl3o=3n9lCI88kow8U=4a+0KyOBx0fGD5BOG1KZfNVSxC69Kwz3DkQM79FxD=3zinLzb=QZ16Kjmhx0YPw3ONv1=5Avpa6phdYz6iPvLB3=1pU1Q56d5FwG05=pEhWbiR5TKRQxYW23FUvPzNAPb9Q=8/=0YLjBUa60ezpEErjiTop2DhW==Qtb0bgnQEuv+nFz6r2uS=e9CbT2TUzEWIAHguA82m/=4=RTKnSc2I9uR4uffo9WXZoTv+dFDG2Sr3SG4IRt4FA0htBx4Y2sqQEYl=R2HbxDFqD+oLNQG5D4PEDqBpKnhn0G5AmKAdCEjryxxD==="
    # requests expects a mapping of cookie name -> value. Passing the whole
    # header under a single "Cookie" key would send a cookie literally named
    # "Cookie", so split the header into its individual cookies instead.
    # Values may themselves contain "=", hence the split on the first one only.
    cookies = dict(pair.split("=", 1) for pair in raw_cookie.split("; "))
    # The upper bound is exclusive: 51 crawls pages 1 through 50.
    get_data(pre_url, suf_url, headers, cookies, 51)