文章詳情頁

python使用bs4爬取boss直聘靜態頁面

瀏覽：7　日期：2022-07-08 16:47:30

思路：

1、將需要查詢的城市列表，通過城市接口轉換成相應的城市code

2、遍歷城市、職位生成url

3、通過url獲取列表頁面信息，遍歷列表頁面信息

4、再根據列表頁面信息的job_link獲取詳情頁面信息，將需要的信息以字典data的形式存在列表datas裡

5、判斷列表頁面是否有下一頁，重複步驟3、4；同時將列表datas一直傳遞下去

6、一個城市、職位url爬取完後，將列表datas接在列表datas_list後面，重複步驟3、4、5

7、最後遍歷列表datas_list的數據，寫入Excel裡面

知識點：

1、將response內容以json形式輸出，解析json並取值

2、soup 的select()和find_all()和find()方法使用

3、異常Exception的使用

4、使用xlwt創建、編輯Excel

import time

import requests
import xlwt
from bs4 import BeautifulSoup


class MyJob:
    """Scrape BOSS Zhipin job listings for one city code and one search keyword.

    The scraper walks every result page of ``list_url``, follows each job's
    detail link, and collects one dict per job.  ``setExcel`` then dumps a
    list of such dicts into an ``.xls`` workbook.
    """

    BASE_URL = 'https://www.zhipin.com'
    # Seconds to wait for a response; without a timeout requests.get() can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Pause (seconds) before every detail request and between result pages.
    PAGE_DELAY = 5

    # Header row of the sheet, written to columns 0-9.  Column 9 only carries
    # the 'detail' label; the detail fields themselves occupy columns 10-20.
    EXCEL_HEADERS = (
        '編號', '招聘鏈接', '招聘崗位', '薪資', '地址',
        '企業鏈接', '企業信息', 'boss鏈接', 'boss信息', 'detail詳情',
    )
    # Column index -> key of the job dict written into that column.
    EXCEL_COLUMNS = {
        0: 'job_id',
        1: 'job_link',
        2: 'job_name',
        3: 'job_red',
        4: 'job_address',
        5: 'job_company_link',
        6: 'job_company',
        7: 'job_publis_link',
        8: 'job_publis',
        10: 'detail_content_name',
        11: 'detail_primary_name',
        12: 'detail_primary_status',
        13: 'detail_primary_salary',
        14: 'detail_primary_address',
        15: 'detail_content_text',
        16: 'detail_op_name',
        17: 'detail_op_job',
        18: 'detail_op_status',
        19: 'detail_primary_tags',
        20: 'detail_content_address',
    }

    def __init__(self, mycity, myquery):
        """Store the search parameters and build the first list-page URL.

        Args:
            mycity: City code as returned by ``get_city``.
            myquery: Search keyword (job title).
        """
        self.city = mycity
        self.query = myquery
        self.list_url = (
            'https://www.zhipin.com/job_detail/?query=%s&city=%s&industry=&position='
            % (self.query, self.city)
        )
        self.datas = []
        # Browser-like headers sent with every list/detail request.
        # NOTE(review): 'br' in accept-encoding presumably requires a brotli
        # decoder to be installed for requests to decode such responses --
        # confirm against the deployment environment.
        self.header = {
            'authority': 'www.zhipin.com',
            'method': 'GET',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'lastCity=101210100;uab_collina=154408714637849548916323;toUrl=/;c=1558272251;g=-;l=l=%2Fwww.zhipin.com%2Fuser%2Flogin.html&r=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1555852331,1556985726,1558169427,1558272251; __a=40505844.1544087205.1558169426.1558272251.41.14.4.31; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1558272385',
            'referer': 'https://www.zhipin.com/?ka=header-logo',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }

    def get_city(self, city_list):
        """Translate city names into the site's city codes.

        Args:
            city_list: Iterable of city names to look up.

        Returns:
            List of codes, one per matching entry, in the order of
            ``city_list``.  Names that match nothing contribute no code.
        """
        city_url = 'https://www.zhipin.com/wapi/zpCommon/data/city.json'
        payload = requests.get(city_url, timeout=self.REQUEST_TIMEOUT).json()
        provinces = payload['zpData']['cityList']
        return [
            district['code']
            for city in city_list
            for province in provinces
            # Tolerate an entry whose sub-level list is missing or null.
            for district in (province.get('subLevelModelList') or [])
            if city == district['name']
        ]

    def get_job_list(self, url, datas):
        """Collect every job from ``url`` and all following result pages.

        Pages are walked in a loop rather than by recursion, so the number of
        pages is not bounded by the interpreter's recursion limit.

        Args:
            url: URL of the first list page to fetch.
            datas: List that the job dicts are appended to (mutated in place).

        Returns:
            The same ``datas`` list, now holding the jobs of all pages.
        """
        while True:
            print(url)
            html = requests.get(
                url, headers=self.header, timeout=self.REQUEST_TIMEOUT
            ).text
            soup = BeautifulSoup(html, 'html.parser')

            for job in soup.select('.job-primary'):
                data = self._parse_list_item(job)
                time.sleep(self.PAGE_DELAY)
                self.get_job_detail(data)  # adds the detail_* keys to data
                print(data)
                datas.append(data)

            url = self._next_page_url(soup)
            if url is None:
                print('最后一頁了；')
                return datas
            time.sleep(self.PAGE_DELAY)

    def _parse_list_item(self, job):
        """Extract the list-page fields of one ``.job-primary`` element."""
        primary = job.find_all('div', attrs={'class': 'info-primary'})[0]
        company = job.find_all('div', attrs={'class': 'info-company'})[0]
        publisher = job.find_all('div', attrs={'class': 'info-publis'})[0]
        title_link = primary.find('a')
        return {
            # Posting id
            'job_id': title_link.get('data-jobid'),
            # Posting link
            'job_link': self.BASE_URL + title_link.get('href'),
            # Job title
            'job_name': primary.find('div', attrs={'class': 'job-title'}).get_text(),
            # Salary
            'job_red': primary.find('span', attrs={'class': 'red'}).get_text(),
            # Address / years of experience / education
            'job_address': primary.find('p').get_text().split(' '),
            # Company link
            'job_company_link': company.find('a').get('href'),
            # Company info
            'job_company': company.find('p').get_text().split(' '),
            # Recruiter avatar link
            'job_publis_link': publisher.find('img').get('src'),
            # Recruiter info
            'job_publis': publisher.find('h3').get_text().split(' '),
        }

    def _next_page_url(self, soup):
        """Return the absolute URL of the next list page, or None on the last page."""
        pager = soup.find('div', attrs={'class': 'page'})
        next_link = pager.find('a', attrs={'class': 'next'}) if pager else None
        href = next_link.get('href') if next_link else None
        # On the last page the "next" anchor points at a javascript no-op.
        if not href or href == 'javascript:;':
            return None
        return self.BASE_URL + href

    def get_job_detail(self, data):
        """Fetch the detail page of one job and add ``detail_*`` keys to ``data``.

        Args:
            data: Job dict holding at least ``job_link``; mutated in place.
        """
        print(data['job_link'])
        html = requests.get(
            data['job_link'], headers=self.header, timeout=self.REQUEST_TIMEOUT
        ).text
        soup = BeautifulSoup(html, 'html.parser')

        content = soup.find_all('div', attrs={'class': 'detail-content'})[0]
        primary = soup.find_all('div', attrs={'class': 'info-primary'})[0]
        # The recruiter block is the second 'detail-op' element on the page.
        recruiter = soup.find_all('div', attrs={'class': 'detail-op'})[1]

        # Hiring company
        data['detail_content_name'] = content.find('div', attrs={'class': 'name'}).get_text()
        # Benefits
        data['detail_primary_tags'] = primary.find('div', attrs={'class': 'job-tags'}).get_text().strip()
        # Job title
        data['detail_primary_name'] = primary.find('h1').get_text()
        # Hiring status
        data['detail_primary_status'] = primary.find('div', attrs={'class': 'job-status'}).get_text()
        # Salary
        data['detail_primary_salary'] = primary.find('span', attrs={'class': 'salary'}).get_text()
        # Address / years of experience / education
        data['detail_primary_address'] = primary.find('p').get_text()
        # Work location
        data['detail_content_address'] = content.find('div', attrs={'class': 'location-address'}).get_text()
        # Job description, one clause per line ('\n', not the letter 'n').
        data['detail_content_text'] = (
            content.find('div', attrs={'class': 'text'}).get_text().strip().replace('；', '\n')
        )
        # Recruiter name
        data['detail_op_name'] = recruiter.find('h2', attrs={'class': 'name'}).get_text()
        # Recruiter line is "<position>·<status>"; the status may be absent.
        recruiter_parts = recruiter.find('p', attrs={'class': 'gray'}).get_text().split('·')
        data['detail_op_job'] = recruiter_parts[0]
        data['detail_op_status'] = recruiter_parts[1] if len(recruiter_parts) > 1 else ''

    @staticmethod
    def _cell_text(value):
        """Return ``value`` in a form ``xlwt`` can write to a cell.

        Fields produced with ``split(' ')`` are lists; they are joined back
        with single spaces so they can be written as one text cell.
        """
        if isinstance(value, (list, tuple)):
            return ' '.join(str(part) for part in value)
        return value

    def setExcel(self, datas_list):
        """Write all collected jobs to a timestamped ``.xls`` workbook.

        Args:
            datas_list: List of job dicts as produced by ``get_job_list``.
        """
        book = xlwt.Workbook(encoding='utf-8')
        table = book.add_sheet('boss軟件測試')

        for col, title in enumerate(self.EXCEL_HEADERS):
            table.write(0, col, title)

        for row, data in enumerate(datas_list, start=1):
            for col, key in self.EXCEL_COLUMNS.items():
                table.write(row, col, self._cell_text(data[key]))

        # The published listing lost its backslashes; the raw-string prefix
        # shows the target was the root of drive C:.
        book.save(r'C:\%s_boss軟件測試.xls' % time.strftime('%Y%m%d%H%M%S'))
        print('Excel保存成功')


if __name__ == '__main__':
    # One parameterless instance serves the two calls that do not depend on a
    # specific city/query, so the export still works when no city resolves.
    helper = MyJob('', '')
    city_list = helper.get_city(['杭州'])
    query_list = ['軟件測試', '測試工程師']
    datas_list = []
    for city in city_list:
        for query in query_list:
            myjob = MyJob(city, query)
            datas = myjob.get_job_list(myjob.list_url, myjob.datas)
            datas_list.extend(datas)
    helper.setExcel(datas_list)

以上就是python使用bs4爬取boss直聘靜態頁面的詳細內容，更多關於python 爬取boss直聘的資料請關注好吧啦網其它相關文章！

上一條：python 裝飾器的使用示例　下一條：通過案例解析python鴨子類型相關原理

相關文章：

1. python爬蟲實戰之制作屬于自己的一個IP代理模塊
2. Java程序的編碼規范（6）
3. HTML 絕對路徑與相對路徑概念詳細
4. python 利用toapi庫自動生成api
5. Spring如何使用xml創建bean對象
6. Android Studio設置顏色拾色器工具Color Picker教程
7. python實現在內存中讀寫str和二進制數據代碼
8. IntelliJ IDEA設置默認瀏覽器的方法
9. python實現PolynomialFeatures多項式的方法
10. python實現讀取類別頻數數據畫水平條形圖案例

排行榜

					
					python爬蟲實戰之制作屬于自己的一個IP代理模塊
python實現在內存中讀寫str和二進制數據代碼
HTML 絕對路徑與相對路徑概念詳細
python 利用toapi庫自動生成api
Java程序的編碼規范（6）
python實現PolynomialFeatures多項式的方法
IntelliJ IDEA設置默認瀏覽器的方法
Spring如何使用xml創建bean對象
Android Studio設置顏色拾色器工具Color Picker教程
python實現讀取類別頻數數據畫水平條形圖案例
Spring教程之refresh()執行邏輯淺析