一、Web Scraping: the DrissionPage Module
DrissionPage module documentation: 【DrissionPage official site】
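All four scrapers in this section follow the same packet-listening pattern: start listening for the target API path, open the search page, wait for the captured packet, then read its JSON body. Below is a minimal sketch of that pattern; the URL and listen target here are placeholders, and each sub-section supplies its own real values.
# Minimal sketch of the listen-then-parse pattern used throughout this section
from DrissionPage import ChromiumPage

page = ChromiumPage()                        # launch / attach to a Chromium browser
page.listen.start('example/api/path')        # placeholder: the API path to capture
page.get('https://www.example.com/search')   # placeholder: the search page to open
packet = page.listen.wait(timeout=10)        # wait for a matching data packet
if packet:
    data = packet.response.body              # JSON responses arrive already parsed as a dict
    print(type(data))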
1. Boss 直聘 (scroll-based pagination)
Python code:
# Import the automation module
from DrissionPage import ChromiumPage
import time
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dictionary
    data_list = data_r['zpData']['jobList']
    # Loop over every job entry in the list
    for index in data_list:
        # Split the salary description into range and pay scheme
        salary_list = index['salaryDesc'].split('·')
        salary = salary_list[0]
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        else:
            salary_system = '12薪'
        # Collect the fields of interest into a dictionary
        temporarily_dict = {
            '公司名称': index['brandName'],
            '公司行业': index['brandIndustry'],
            '公司规模': index['brandScaleName'],
            '融资阶段': index['brandStageName'],
            '工作区域': index['cityName'] + ' ' + index['areaDistrict'] + ' ' + index['businessDistrict'],
            '学历要求': index['jobDegree'],
            '工作经验': index['jobExperience'],
            '职位名称': index['jobName'],
            '薪资待遇': salary,
            '薪资制度': salary_system,
            '沟通职员': index['bossTitle'] + '-' + index['bossName'],
            '所需技能': ' '.join(index['skills']),
            '公司福利': ' '.join(index['welfareList']),
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the data packet (adjust the target as needed)
    google.listen.start(r"wapi/zpgeek/search/joblist.json")
    # Visit the target page (adjust the URL as needed)
    google.get(r"https://www.zhipin.com/web/geek/jobs?city=101280100&query=%E9%A1%B9%E7%9B%AE%E5%8A%A9%E7%90%86")
    # Create the output file object
    f = open('boss_project_assistant.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dictionary writer for the CSV file
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司行业', '公司规模', '融资阶段', '工作区域', '学历要求',
                                       '工作经验', '职位名称', '薪资待遇', '薪资制度', '沟通职员', '所需技能',
                                       '公司福利'])
    cd.writeheader()
    num = 50
    for page in range(1, num + 1):
        print(f'Processing page {page} ...')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                # Get the response body (a dictionary)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page to trigger the next load
                    google.scroll.to_bottom()
                    time.sleep(1)
            else:
                print('No more information!')
                exit(1)
    # Close the output file
    f.close()


if __name__ == '__main__':
    main()
Run results:
2. 前程无忧 (51job, click-based pagination)
Python code:
# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dictionary
    data_list = data_r['resultbody']['job']['items']
    # Loop over every job entry in the list
    for index in data_list:
        # Split the salary string into range and pay scheme
        salary_list = index['provideSalaryString'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Work out the district
        district_string = '未知'
        if 'districtString' in index['jobAreaLevelDetail']:
            district_string = index['jobAreaLevelDetail']['districtString']
        # Work out the company industry
        company_type = index['companyIndustryType1Str']
        if 'companyIndustryType2Str' in index and index['companyIndustryType2Str'] != index['companyIndustryType1Str']:
            company_type = index['companyIndustryType1Str'] + ';' + index['companyIndustryType2Str']
        # Work out the HR status fields
        hr_labels, hr_active_status_green, hr_info = '未知', '未知', '未知'
        if 'hrLabels' in index and index['hrLabels'] != []:
            hr_labels = index['hrLabels'][0]
        if 'hrActiveStatusGreen' in index:
            hr_active_status_green = index['hrActiveStatusGreen']
        if 'hrPosition' in index and 'hrName' in index:
            hr_info = index['hrPosition'] + '-' + index['hrName']
        # Collect the fields of interest into a dictionary
        temporarily_dict = {
            '公司名称': index['fullCompanyName'],
            '公司性质': index['companyTypeString'],
            '公司领域': company_type,
            '公司规模': index['companySizeString'],
            '职位名称': index['jobName'],
            '优先专业': index['major1Str'] + ' ' + index['major2Str'],
            '所在省份': index['jobAreaLevelDetail']['provinceString'],
            '所在城市': index['jobAreaLevelDetail']['cityString'],
            '所在地区': district_string,
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['termStr'],
            '所需学历': index['degreeString'],
            '所需经验': index['workYearString'],
            '沟通HR': hr_info,
            '处理速度': hr_labels,
            '在线时间': hr_active_status_green,
            '投递频率': index['applyTimeText'],
            '公司详情页': index['companyHref'],
            '其他标签': ','.join(index['jobTags'])
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the data packet
    google.listen.start(r"api/job/search-pc")
    # Visit the target page
    google.get(r"https://we.51job.com/pc/search?jobArea=260200&keyword=Python&searchType=2&keywordType=")
    # Create the output file object
    f = open('51job_artificial_intelligence.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dictionary writer for the CSV file
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模',
                                       '职位名称', '优先专业', '所在省份', '所在城市',
                                       '所在地区', '薪资范围', '薪资制度', '工作形式',
                                       '所需学历', '所需经验', '沟通HR', '处理速度',
                                       '在线时间', '投递频率', '公司详情页', '其他标签'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Scroll to the bottom of the page
            google.scroll.to_bottom()
            # Locate the next-page button and click it
            button = google.ele('css:.el-icon-arrow-right')
            button.run_js('this.click();')
            google.scroll.to_bottom()
            # Pause listening and clear the captured packet queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the previous-page button and click it, so page 1 is captured afresh
            button = google.ele('css:.el-icon-arrow-left')
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page} ...')
                # Get the response body (a dictionary)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the next-page button and click it
                    button = google.ele('css:.el-icon-arrow-right')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the output file
    f.close()


if __name__ == '__main__':
    main()
dp.ele()
→ Locates an element on the page; dp is the browser (page) object and ele is short for "element".
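For reference, a few locator forms that ele() accepts, shown as a sketch; the selectors and URL below are placeholders rather than ones taken from these sites.
# Sketch of common ele() locator forms (placeholder selectors and URL)
from DrissionPage import ChromiumPage

dp = ChromiumPage()
dp.get('https://www.example.com')
btn_css = dp.ele('css:.pagination a')   # css: prefix, a standard CSS selector
btn_id = dp.ele('#submit')              # #id shorthand
btn_text = dp.ele('下一页')              # plain text matches an element by its text,
                                        # the same way the 猎聘 example calls google('下一页')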
Run results:
3. 智联招聘 (click-based pagination)
Python code:
# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dictionary
    data_list = data_r['data']['list']
    # Loop over every job entry in the list
    for index in data_list:
        # Gather the skill labels
        skill_result = ''
        for skill_dictionary in index['skillLabel']:
            for key, value in skill_dictionary.items():
                if key == 'value':
                    skill_result += value + ' '
        # Split the salary string into range and pay scheme
        salary_list = index['salary60'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Work out the welfare list, preferring the longer one
        welfare_str = ' '.join(index['welfareTagList'])
        if 'jobKnowledgeWelfareFeatures' in index and len(index['jobKnowledgeWelfareFeatures']) > len(
                index['welfareTagList']):
            welfare_str = ' '.join(index['jobKnowledgeWelfareFeatures'])
        # Work out the HR reply speed
        hr_processing_speed = '未知'
        if 'hrStateInfo' in index and len(index['hrStateInfo']) > 0:
            hr_processing_speed = index['hrStateInfo']
        # Collect the fields of interest into a dictionary
        temporarily_dict = {
            '公司名称': index['companyName'],
            '公司性质': index['property'],
            '公司领域': index['industryName'],
            '公司规模': index['companySize'],
            '职位名称': index['name'],
            '所在城市': index['workCity'],
            '所在地区': index['cityDistrict'],
            '所在街道': index['streetName'],
            '公司源址': index['jobRootOrgInfo']['cityName'],
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['workType'],
            '所需学历': index['education'],
            '所需经验': index['workingExp'],
            '所需技能': skill_result,
            '沟通HR': index['staffCard']['hrJob'] + '-' + index['staffCard']['staffName'],
            '处理速度': hr_processing_speed,
            '在线时间': index['staffCard']['hrOnlineState'],
            '公司详情页': index['companyUrl'],
            '职位详情页': index['positionUrl'],
            '其他福利': welfare_str
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the data packet
    google.listen.start(r"c/i/search/positions")
    # Visit the target page (page 2, so clicking "previous" re-requests page 1)
    google.get(r"https://www.zhaopin.com/sou/jl765/kw01800U80EG06G03F01N0/p2?kt=3")
    # Create the output file object
    f = open('zhaopin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dictionary writer for the CSV file
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模', '职位名称',
                                       '所在城市', '所在地区', '所在街道', '公司源址', '薪资范围',
                                       '薪资制度', '工作形式', '所需学历', '所需经验', '所需技能',
                                       '沟通HR', '处理速度', '在线时间', '公司详情页', '职位详情页', '其他福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Pause listening and clear the captured packet queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the previous-page button and click it
            button = google.ele('css:.soupager a:first-of-type')
            # Scroll the page so the button is in view
            google.scroll.to_see(button)
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page} ...')
                # Get the response body (a dictionary)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Locate the next-page button and click it
                    button = google.ele('css:.soupager a:last-of-type')
                    # Scroll the page so the button is in view
                    google.scroll.to_see(button)
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the output file
    f.close()


if __name__ == '__main__':
    main()
css:.soupager a:first-of-type
→ css:.soupager selects the element whose class is soupager; a:first-of-type picks the first <a> tag inside it.
css:.soupager a:last-of-type
→ css:.soupager selects the same container; a:last-of-type picks the last <a> tag inside it.
a:nth-of-type(even) picks the <a> tags at even positions, and a:nth-of-type(odd) picks those at odd positions.
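To make those position-based selectors concrete, here is a small sketch that locates the pager links on the same 智联 page, using eles() (which returns every match) alongside ele():
# Sketch: position-based CSS selectors with ele()/eles()
from DrissionPage import ChromiumPage

page = ChromiumPage()
page.get('https://www.zhaopin.com/sou/jl765/kw01800U80EG06G03F01N0/p2?kt=3')
prev_btn = page.ele('css:.soupager a:first-of-type')           # the first <a> inside .soupager
next_btn = page.ele('css:.soupager a:last-of-type')            # the last <a> inside .soupager
even_links = page.eles('css:.soupager a:nth-of-type(even)')    # every <a> at an even position
odd_links = page.eles('css:.soupager a:nth-of-type(odd)')      # every <a> at an odd position
print(len(even_links), len(odd_links))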
Run results:
4. 猎聘网 (click-based pagination)
Python code:
# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dictionary
    data_list = data_r['data']['soJobForms']
    # Loop over every job entry in the list
    for index in data_list:
        # Split the salary string into range and pay scheme
        salary_list = index['salary'].split('·')
        salary_value = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Work out the skill and welfare labels
        skill_value, welfare_value = '', ''
        if 'jobLabels' in index and 'sellingPointList' in index:
            skill_list = [item for item in index['jobLabels'] if item not in index['sellingPointList']]
            if len(skill_list) >= 1:
                skill_value = ';'.join(skill_list)
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        elif 'jobLabels' in index:
            if len(index['jobLabels']) >= 1:
                skill_value = ';'.join(index['jobLabels'])
        elif 'sellingPointList' in index:
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        # Work out the contact HR
        hr = index['recruiterName']
        if 'recruiterTitle' in index:
            hr = index['recruiterTitle'] + '-' + index['recruiterName']
        # Work out the company scale
        scale_value = '未知'
        if 'compScale' in index:
            scale_value = index['compScale']
        # Collect the fields of interest into a dictionary
        temporarily_dict = {
            '公司名称': index['company'],
            '公司领域': index['industry'],
            '公司规模': scale_value,
            '职位名称': index['title'],
            '所在地址': index['dq'],
            '薪资范围': salary_value,
            '薪资制度': salary_system,
            '所需学历': index['requireEduLevel'],
            '所需技能': skill_value,
            '所需经验': index['requireWorkYears'],
            '沟通HR': hr,
            '发布时间': index['date'],
            '公司福利': welfare_value
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the data packet
    google.listen.start(r"api/com.liepin.searchfront4c.h5-search-job")
    # Visit the target page
    google.get(r"https://m.liepin.com/zhaopin/?dqs=170020&keyword=Python")
    # Create the output file object
    f = open('liepin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dictionary writer for the CSV file
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司领域', '公司规模', '职位名称', '所在地址',
                                       '薪资范围', '薪资制度', '所需学历', '所需技能', '所需经验',
                                       '沟通HR', '发布时间', '公司福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page} ...')
                # Get the response body (a dictionary)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the "下一页" (next page) button by its text and click it
                    button = google('下一页')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the output file
    f.close()


if __name__ == '__main__':
    main()
Run results:
二、Web Scraping: the Requests Module
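Both examples in this section skip browser automation: they call the site's JSON API directly with requests, and differ only in how the sign parameter is produced (compiled JS for 得物, MD5 for 闲鱼). As a sketch, the shared request shape looks like this; the URL, parameters and headers below are placeholders.
# Shared request shape for this section (placeholder URL, parameters and headers)
import requests

url = 'https://api.example.com/search'           # placeholder endpoint
params = {'sign': '...', 't': '1700000000000'}   # signing parameters go in the query string
payload = {'keyword': 'python', 'pageNum': 1}    # request body: json= for a JSON body (得物), data= for form data (闲鱼)
headers = {'User-Agent': '...', 'Cookie': '...'}
response = requests.post(url, params=params, json=payload, headers=headers)
data = response.json()                           # parsed JSON body as a dict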
1. 得物 (reverse-engineered JS signing)
js_file.js module code: 【the JS module used when scraping 得物】
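The Python side loads that JS file with the execjs module, which compiles JavaScript source and calls a named function from Python (a Node.js or other execjs-supported runtime is assumed to be installed). A minimal, self-contained illustration with a toy function:
# Minimal illustration of execjs: compile JS source, then call a function by name
import execjs

ctx = execjs.compile('function add(a, b) { return a + b; }')
print(ctx.call('add', 1, 2))  # -> 3
# The script below does the same thing with the contents of js_file.js,
# calling its function 'c' to compute the sign parameter.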
Python code:
import requests
import pandas as pd
# Module for compiling and running JS code
import execjs
# ------------------------------------------------
import openpyxl
from openpyxl.drawing.image import Image as xlImage
from openpyxl.utils import get_column_letter
from PIL import Image
from io import BytesIO


def get_data_xlsx(js_path, save_path):
    # Request headers
    request_header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'content-length': '124',
        'content-type': 'application/json',
        'cookie': '...',
        'host': 'app.dewu.com',
        'ltk': '...',
        'origin': 'https://www.dewu.com',
        'referer': 'https://www.dewu.com/',
        'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'sessionid': '...',
        'shumeiid': '...',
        'sk': '',
        'traceparent': '...',
        'user-agent': '...'
    }
    # Request URL
    request_url = r'https://app.dewu.com/api/v1/h5/commodity-pick-interfaces/pc/pick-rule-result/feeds/info'
    # Request payload
    request_parameters = {
        'filterUnbid': True,
        'pageNum': 1,  # page number
        'pageSize': 24,
        'pickRuleId': 644443,  # category ID
        'showCspu': True
    }
    # Compile the JS code
    js_code = execjs.compile(open(js_path, encoding='utf-8').read())
    # Get the sign parameter, e.g. '0e5d10fb111f2afef6ac0a1776187e23'
    sign_data = js_code.call('c', request_parameters)
    # Add the sign to the request payload
    request_parameters['sign'] = sign_data
    print('Data is being requested and processed…')
    # Request the data
    response = requests.post(url=request_url, json=request_parameters, headers=request_header)
    # Parse the JSON response
    data_json = response.json()
    # Create an empty list
    dewu_info = []
    # Extract the data
    info_list = data_json['data']['list']
    for index in info_list:
        info_dict = {
            '标题': index['title'],
            '价格': index['price'] / 100,
            '图片网址': index['logoUrl']
        }
        # Append the record
        dewu_info.append(info_dict)
    # Convert to a DataFrame
    df = pd.DataFrame(dewu_info)
    # Export and save as an Excel workbook
    df.to_excel(save_path, index=False)
    print(f'The data is already saved in {save_path}')


def download_image(url):
    rg_url = requests.get(url)
    # Check the response status code
    if rg_url.status_code == 200:
        # Create an image object
        image = Image.open(BytesIO(rg_url.content))
        # Normalise the image mode
        if image.mode != 'RGB':
            image = image.convert('RGB')
        # Resize the image
        return image.resize((150, 96))
    else:
        # Raise OSError so the caller's except clause catches failed downloads
        raise OSError(f"Unable to download image, status code: {rg_url.status_code}")


def link_to_png(source_path, destination_path):
    # Load the Excel workbook
    wb = openpyxl.load_workbook(source_path)
    # Default to the first sheet
    sheet = wb.active
    # Adjust row heights and column width
    for row in range(2, sheet.max_row + 1):
        sheet.row_dimensions[row].height = 75
    sheet.column_dimensions['C'].width = 20
    # Read each link, download the image and insert it at the matching cell
    for row in range(2, sheet.max_row + 1):
        # Image links start at row 2; column C (column=3) holds the link
        link = sheet.cell(row=row, column=3).value
        # Clear the cell content
        sheet.cell(row=row, column=3).value = None
        # Skip empty links
        if link:
            # Send an HTTP request to download the image
            try:
                resized_image = download_image(link)
            except OSError:
                print(f"Failed to download image {link}")
                continue
            else:
                # Insert the resized image into the worksheet
                img_bytes = BytesIO()
                resized_image.save(img_bytes, format='PNG')  # save the image into memory
                img = xlImage(img_bytes)
                sheet.add_image(img, f'{get_column_letter(3)}{row}')  # place the image at the target cell
    wb.save(destination_path)  # required
    wb.close()  # required


if __name__ == '__main__':
    j_path = './js_file.js'
    s_path = './dewu_link.xlsx'
    # Fetch the data and save it as an Excel file
    get_data_xlsx(j_path, s_path)
    d_path = './dewu_png.xlsx'
    print('Excel file is being processed…')
    link_to_png(s_path, d_path)
    print(f'The data is already saved in {d_path}')
Run results:
2. 闲鱼 (MD5-signed requests)
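The request signature follows the pattern visible in get_sign() below: a time-limited token, a millisecond timestamp, the appKey 34839810, and the JSON payload are joined with '&' and hashed with MD5. A stripped-down sketch of just that step (the token and the abridged payload here are placeholders; the real script uses d_token and the full search payload):
# Stripped-down sketch of the MD5 signing step used by get_sign() below
import hashlib
import time

token = 'xxxxxxxx'                        # placeholder; the real token expires and must be refreshed
app_key = '34839810'
payload = '{"pageNumber": 1, "keyword": "python爬虫书籍"}'  # abridged form of the real payload
timestamp = int(time.time() * 1000)
raw = f'{token}&{timestamp}&{app_key}&{payload}'
sign = hashlib.md5(raw.encode('utf-8')).hexdigest()
print(timestamp, sign)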
Python code:
# Import the request module
import requests
import csv
# Import the hashing module
import hashlib
import time


def get_sign(page):
    d_token = '...'  # d_token expires over time; fill in your own value
    j = int(time.time() * 1000)
    h = '34839810'
    c_data = ('{"pageNumber": %d, '
              '"keyword": "python爬虫书籍", '
              '"fromFilter": false, '
              '"rowsPerPage": 30, '
              '"sortValue": "", '
              '"sortField": "", '
              '"customDistance": "", '
              '"gps": "", '
              '"propValueStr": {}, '
              '"customGps": "", '
              '"searchReqFromPage": "pcSearch", '
              '"extraFilterValue": "{}", '
              '"userPositionJson": "{}"}') % page
    result_str = d_token + "&" + str(j) + "&" + h + "&" + c_data
    # Create an MD5 hash object
    md_str = hashlib.md5()
    # Feed in the string to be hashed
    md_str.update(result_str.encode('utf-8'))
    # Produce the hex digest
    sign = md_str.hexdigest()
    return sign, j, c_data


def get_data_csv(file_path, head_name):
    # Impersonate a browser (request headers)
    request_header = {
        'Referer': 'https://www.goofish.com/',
        # The cookie carries user information and is often checked for a logged-in account
        # (a cookie exists whether or not you are logged in); it expires, so fill in your own
        'Cookie': '...',
        # The user-agent identifies the browser / device making the request
        'User-Agent': '...'
    }
    # Request URL
    request_url = r'https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/'
    # Create the output file object
    f = open(file_path, mode='a', encoding='utf-8-sig', newline='')
    # Dictionary writer for the CSV file
    cd = csv.DictWriter(f, fieldnames=head_name)
    cd.writeheader()
    # Loop over the result pages
    num = 10
    for i in range(1, num + 1):
        print(f'Scraping page {i} ...')
        # Get the sign parameter, the timestamp and the form payload
        sign, j_time, c_data = get_sign(i)
        # Query parameters
        query_parameters = {
            'jsv': '2.7.2',
            'appKey': '34839810',
            't': str(j_time),
            'sign': sign,
            'v': '1.0',
            'type': 'originaljson',
            'accountSite': 'xianyu',
            'dataType': 'json',
            'timeout': '20000',
            'api': 'mtop.taobao.idlemtopsearch.pc.search',
            'sessionOption': 'AutoLoginOnly',
            'spm_cnt': 'a21ybx.search.0.0',
            'spm_pre': 'a21ybx.home.searchSuggest.1.4c053da6IXTxSx',
            'log_id': '4c053da6IXTxSx'
        }
        # Form data
        form_data = {"data": c_data}
        # Send the request
        response = requests.post(url=request_url, params=query_parameters, data=form_data, headers=request_header)
        # Parse the JSON response into a dictionary
        data_json = response.json()
        # Pull out the list of result items
        info_list = data_json['data']['resultList']
        # Loop over every item in the list
        for index in info_list:
            # Work out the seller's nickname
            nick_name = '未知'
            if 'userNickName' in index['data']['item']['main']['exContent']:
                nick_name = index['data']['item']['main']['exContent']['userNickName']
            # Work out the price
            price_list = index['data']['item']['main']['exContent']['price']
            price = ''
            for p in price_list:
                price += p['text']
            # Build the item detail-page link
            item_id = index['data']['item']['main']['exContent']['itemId']
            link = f'https://www.goofish.com/item?id={item_id}'
            temporarily_dict = {
                '标题': index['data']['item']['main']['exContent']['title'],
                '地区': index['data']['item']['main']['exContent']['area'],
                '售价': price,
                '用户名': nick_name,
                '详情页链接': link
            }
            cd.writerow(temporarily_dict)
    f.close()


if __name__ == '__main__':
    f_path = './fish_python.csv'
    h_name = ['标题', '地区', '售价', '用户名', '详情页链接']
    get_data_csv(f_path, h_name)
Run results: