Python爬取前程无忧大数据57000条详细信息，看看你到底适合什么？（requests请求）

前程无忧大数据网址：https://search.51job.com/list/000000,000000,0000,00,9,99,%E5%A4%A7%E6%95%B0%E6%8D%AE,2,1.html
入口获取所有的大数据相关岗位招聘信息。如从多个招聘网站获取更多招聘信息更好。
提取数据项至少包括以下字段：
（1）职位名称（岗位名称）、公司名称、工作地点、薪资（底薪-上限）、发布时间（月-日）；
说明：在招聘列表中获取

（2）工作年限要求、学历要求、招聘人数、职能类别；
（3）公司性质、公司规模（人数）、公司所属行业。
（2）和（3）字段对应网页位置：

由于数据量大，详情页面格式不统一，为了能够适应全部数据，条件判断稍微多了点。但是偶尔还是可能会出现SSL证书验证的问题，重新跑即可。

# Scrape 51job search results for the "big data" keyword, follow every
# listing to its detail page, and append one CSV row per job.
import csv
import logging
import os

import requests
from lxml import etree

logger = logging.getLogger(__name__)

# Output file shared by write_header() and save_data().
CSV_PATH = 'qianchengwuyou.csv'

# Seconds to wait for a server before giving up on a request.
REQUEST_TIMEOUT = 10

_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
)

# Every detail link is rewritten onto this host before it is fetched.
_DETAIL_HOST = "https://jobs.51job.com/"

# Record keys in the order their values are written to the CSV; the order
# matches the header row emitted by write_header().
_CSV_KEYS = (
    'job_name', 'company_name', 'work_space', 'work_pay', 'pubish_date',
    'year_limit', 'edu_limit', 'recruit_num', 'job_type',
    'company_property', 'company_num', 'company_business',
)

# Position of each requirement inside the "|"-separated title attribute of
# the detail page's <p class="msg ltype"> element.
_REQUIREMENT_POSITIONS = (
    ('year_limit', 1),
    ('edu_limit', 2),
    ('recruit_num', 3),
)

_COMPANY_KEYS = ('company_property', 'company_num', 'company_business')


def get_response(url):
    """Download *url* and return the response body decoded as GBK.

    Certificate verification is disabled and the resulting urllib3 warnings
    are routed into ``logging`` instead of being printed.

    Raises:
        requests.RequestException: on connection errors or when the server
            does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    headers = {'User-Agent': _USER_AGENT}
    logging.captureWarnings(True)
    # NOTE(review): verify=False turns off TLS certificate checks; kept
    # because the script has always run that way, but it permits MITM.
    response = requests.get(
        url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    response.encoding = 'gbk'
    return response.text


def _parse_html(text):
    """Parse *text* into an lxml tree; return None when nothing is parsable."""
    if not text or not text.strip():
        return None
    try:
        return etree.HTML(text)
    except etree.LxmlError:
        return None


def _first(node, path):
    """Return the first XPath match as whitespace-stripped text, or ''."""
    matches = node.xpath(path)
    return str(matches[0]).strip() if matches else ""


def _detail_url(href):
    """Rewrite a listing link onto the detail host; '' when it is unusable."""
    if "https" not in href:
        return ""
    tail = href.partition("com/")[2]
    return _DETAIL_HOST + tail if tail else ""


def _default_detail():
    """Return the placeholder values used when a detail field is missing."""
    return {
        'year_limit': "无工作经验",
        'edu_limit': "本科",
        'recruit_num': "招若干人",
        'job_type': "无",
        'company_property': "无",
        'company_num': "无",
        'company_business': "无",
    }


def _parse_detail(detail_html):
    """Extract the detail-page fields from a parsed tree (or None)."""
    fields = _default_detail()
    if detail_html is None:
        return fields

    # The title attribute packs the requirements as "a | b | c | d" with
    # non-breaking spaces around each bar; str.strip() removes those too.
    title = _first(detail_html, './/p[@class="msg ltype"]/@title')
    if "|" in title:
        parts = [part.strip() for part in title.split("|")]
        for name, position in _REQUIREMENT_POSITIONS:
            # Pages with fewer segments keep the placeholder value.
            if position < len(parts) and parts[position]:
                fields[name] = parts[position]

    # Only the part before the first "/" of the category text is kept.
    job_type = _first(detail_html, './/a[@class="el tdn"]/text()')
    job_type = job_type.split("/")[0].strip()
    if job_type:
        fields['job_type'] = job_type

    for position, name in enumerate(_COMPANY_KEYS, start=1):
        value = _first(
            detail_html, './/p[@class="at"][{}]/@title'.format(position)
        )
        if value:
            fields[name] = value
    return fields


def _fetch_detail(url):
    """Download and parse one detail page; placeholders on any failure."""
    if not url:
        return _default_detail()
    try:
        text = get_response(url)
    except requests.RequestException as exc:
        # One unreachable detail page should not abort the whole crawl.
        logger.warning("detail page %s could not be fetched: %s", url, exc)
        return _default_detail()
    return _parse_detail(_parse_html(text))


def get_html(html):
    """Parse one search-result page and enrich every listing with details.

    Args:
        html: markup of a search-result page as text.

    Returns:
        A list with one dict per listing. Each dict holds the keys
        ``job_name``, ``company_name``, ``work_space``, ``work_pay``,
        ``pubish_date``, ``detail_href``, ``year_limit``, ``edu_limit``,
        ``recruit_num``, ``job_type``, ``company_property``,
        ``company_num`` and ``company_business``. An empty list is returned
        when *html* cannot be parsed.

    Each record is printed as soon as it is complete. A listing without a
    usable https link gets the placeholder detail values and an empty
    ``detail_href``; no detail page is requested for it.
    """
    page = _parse_html(html)
    if page is None:
        return []

    info = []
    # The first four matches are skipped, presumably because they are not
    # job rows -- TODO confirm against the live page markup.
    for row in page.xpath('//div[@class="el"]')[4:]:
        key = {
            'job_name': _first(row, './/p[@class="t1 "]//a/@title'),
            'company_name': _first(row, './/span[@class="t2"]/a/text()'),
            'work_space': _first(row, './/span[@class="t3"]/text()'),
            'work_pay': _first(row, './/span[@class="t4"]/text()'),
            'pubish_date': _first(row, './/span[@class="t5"]/text()'),
        }
        href = _first(row, './/p[@class="t1 "]//a/@href')
        key['detail_href'] = _detail_url(href)
        key.update(_fetch_detail(key['detail_href']))
        info.append(key)
        print(key)
    return info


def write_header():
    """Write the CSV header row unless the output file already has content."""
    # The file is opened in append mode on every run, so an unconditional
    # write would insert a second header between old and new data rows.
    if os.path.exists(CSV_PATH) and os.path.getsize(CSV_PATH) > 0:
        return
    headers = ['职位名称', '公司名称', '工作地点', '薪资', '发布日期',
               '工作年限要求', '学历要求', '招聘人数', '职能类别',
               '公司性质', '公司规模', '公司所属行业']
    with open(CSV_PATH, 'a+', encoding='UTF-8', newline='') as fp:
        csv.writer(fp).writerow(headers)


def save_data(info):
    """Append one CSV row per record in *info*.

    Args:
        info: iterable of dicts as produced by get_html().

    Raises:
        KeyError: if a record lacks one of the CSV columns.
    """
    with open(CSV_PATH, 'a+', encoding='UTF-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerows([key[name] for name in _CSV_KEYS] for key in info)


def main():
    """Crawl result pages 1 through 1105 and append each page to the CSV."""
    write_header()
    # The keyword in the path is deliberately double percent-encoded.
    template = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    for page_number in range(1, 1106):
        html = get_response(template.format(page_number))
        info = get_html(html)
        # Saving page by page keeps earlier results if a later page fails.
        save_data(info)


if __name__ == '__main__':
    main()

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。