A Zhilian Zhaopin (zhaopin.com) Job Crawler

Straight to the code; the crawler's approach is explained in the comments.

import requests
from lxml import etree
import time
import re

'''
1. Requirements analysis
    Fields to collect: title gsmc gz addr jy xl fuli
    (job title, company name, salary, address, experience, education, benefits)
    Entry URL: https://www.zhaopin.com/

2. Page analysis
    All job category tags: //div[@class='zp-jobNavigater-pop-list']/a
    Job listing page: https://sou.zhaopin.com/?jl=489&kw=Java%E5%BC%80%E5%8F%91&kt=3

3. Implementation (below)
'''

# 1. Fetch the job category tags from the homepage
def get_job_tag(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url, headers=headers).text
    # Parse the page source
    html = etree.HTML(response)
    # Extract the text of every job category link
    job_tag = html.xpath("//div[@class='zp-jobNavigater-pop-list']/a/text()")
    return job_tag
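For reference, a minimal usage sketch; the exact category names depend on the live homepage markup, so the output comment is only illustrative:

# Illustrative usage; actual tag names come from the live page
tags = get_job_tag('https://www.zhaopin.com/')
print(tags[:5])  # e.g. category strings such as 'Java开发'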

# 2. Fetch one page of job listings from the search API and save each record
def get_job_info(url, start, kw):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    info_html = requests.get(url.format(start, kw), headers=headers).json()
    for i in info_html['data']['results']:
        # Build a fresh dict per listing so records do not overwrite each other
        job_dict = {}
        job_dict['city'] = i['city']['items'][0]['name']
        job_dict['company_name'] = i['company']['name']
        job_dict['company_size'] = i['company']['size']['name']
        job_dict['companyType'] = i['company']['type']['name']
        job_dict['eduLevel'] = i['eduLevel']['name']
        job_dict['emplType'] = i['emplType']
        job_dict['jobname'] = i['jobName']
        job_dict['jobType'] = i['jobType']['display']
        job_dict['salary'] = i['salary']
        job_dict['welfare'] = i['welfare']
        job_dict['updateDate'] = i['updateDate']
        job_dict['workingExp'] = i['workingExp']['name']
        # Deduplicate, clean, then save
        if unique_data(job_dict):
            job_dict = clear_data(job_dict)
            save_data(job_dict)
    # Total number of matching listings, used for pagination
    return info_html['data']['numFound']
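The parsing above implies a particular response shape. Reconstructed from the field accesses alone (not from any official API documentation), each element of data['results'] is assumed to look roughly like this, with placeholder values:

# Assumed shape of one element of info_html['data']['results'],
# inferred from the field accesses in get_job_info
example_result = {
    'city': {'items': [{'name': '...'}]},
    'company': {'name': '...', 'size': {'name': '...'}, 'type': {'name': '...'}},
    'eduLevel': {'name': '...'},
    'emplType': '...',
    'jobName': '...',
    'jobType': {'display': '...'},
    'salary': '...',
    'welfare': ['...'],
    'updateDate': '...',
    'workingExp': {'name': '...'},
}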

# Filter out duplicate records
companyList = []
jobNameList = []

def unique_data(data):
    if data['jobname'] in jobNameList and data['company_name'] in companyList:
        return False
    else:
        companyList.append(data['company_name'])
        jobNameList.append(data['jobname'])
        return True
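Note that checking the two lists independently can wrongly skip a new company/job combination once both values have been seen separately (company A posting a job title that only company B had posted before). A stricter variant, shown here as a sketch rather than part of the original script, keys on the pair itself:

# Sketch: pair-based deduplication via a set of (company, jobname) tuples
seen_pairs = set()

def unique_data_strict(data):
    key = (data['company_name'], data['jobname'])
    if key in seen_pairs:
        return False
    seen_pairs.add(key)
    return True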

# Clean up the data before saving
def clear_data(data):
    # Join the welfare list into a single '/'-separated string
    data['welfare'] = '/'.join([str(i) for i in data['welfare']])
    # Strip CJK characters (e.g. unit suffixes) from the company size
    pattern = re.compile('[\u4E00-\u9FA5]+')
    data['company_size'] = pattern.sub('', data['company_size'])
    return data
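For example, assuming a size label of the form the site typically returns, the CJK-stripping regex keeps only the numeric range:

# Illustrative only; '100-299人' is an assumed sample value
pattern = re.compile('[\u4E00-\u9FA5]+')
print(pattern.sub('', '100-299人'))  # -> 100-299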

# Append one record per line to a text file
def save_data(data):
    data = '::'.join([str(i) for i in data.values()])
    print(data)
    with open('zlzp.txt', 'a+', encoding='utf-8') as file:
        file.write(data + '\n')
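Since each record is written as one '::'-delimited line in the order the fields are assigned in get_job_info, reading the file back is straightforward. A minimal sketch, assuming Python 3.7+ (insertion-ordered dicts) and that no field value itself contains '::':

FIELDS = ['city', 'company_name', 'company_size', 'companyType',
          'eduLevel', 'emplType', 'jobname', 'jobType',
          'salary', 'welfare', 'updateDate', 'workingExp']

def load_data(path='zlzp.txt'):
    # Rebuild one dict per line, pairing values with the known field order
    records = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            values = line.rstrip('\n').split('::')
            records.append(dict(zip(FIELDS, values)))
    return records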

# Entry point
if __name__ == '__main__':
    # Step 1: request the homepage and collect the job category tags
    starturl = 'https://www.zhaopin.com/'
    job_tag_list = get_job_tag(starturl)

    # Step 2: page through the job listing API for the first category
    infourl = ('https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=60'
               '&cityId=489&workExperience=-1&education=-1&companyType=-1'
               '&employmentType=-1&jobWelfareTag=-1&kw={1}&kt=3')
    start = 0
    page = 1
    while True:
        numFound = get_job_info(infourl, start, job_tag_list[0])
        print('Page {0}'.format(page))
        if start < numFound:
            start += 60
            page += 1
            time.sleep(0.5)  # throttle requests between pages
        else:
            break
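One consequence of checking start < numFound only after the request: the loop always issues one final request whose offset is past the end (presumably returning an empty results list) before breaking. For a query matching, say, 1,500 listings at 60 results per page, the arithmetic works out as follows:

# Page-count sanity check for a hypothetical numFound
import math
numFound = 1500                    # assumed example value
pages = math.ceil(numFound / 60)   # pages that contain data
requests_made = pages + 1          # plus the trailing empty request
print(pages, requests_made)        # -> 25 26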
