Python Web Scraping Examples (updated from time to time)

Published: 2025-06-22


I. Web Scraping — the DrissionPage Module

DrissionPage module documentation: DrissionPage official site
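
All four examples in this part follow the same packet-listening pattern: start a listener for a URL fragment, open the page, wait for a matching request, and read its JSON body. A minimal sketch of that pattern (the URL fragment and target page below are placeholders, not a real site):

# minimal DrissionPage listen/wait sketch (placeholder URLs)
from DrissionPage import ChromiumPage

page = ChromiumPage()                      # launch or attach to a Chromium browser
page.listen.start('some/api/path.json')    # placeholder: URL fragment to match
page.get('https://example.com/list')       # placeholder: page that triggers the request
packet = page.listen.wait(timeout=5)       # block until a matching packet arrives (or time out)
if packet:
    data = packet.response.body            # response body parsed into a Python dict
    print(data)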

1. Boss Zhipin (Boss 直聘) — scroll-based pagination

Python code:

# Import the browser automation module
from DrissionPage import ChromiumPage
import time
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['zpData']['jobList']
    # Loop over each job entry in the list
    for index in data_list:
        # Split the salary string into range and pay-months scheme
        salary_list = index['salaryDesc'].split('·')
        salary = salary_list[0]
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        else:
            salary_system = '12薪'
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['brandName'],
            '公司行业': index['brandIndustry'],
            '公司规模': index['brandScaleName'],
            '融资阶段': index['brandStageName'],
            '工作区域': index['cityName'] + ' ' + index['areaDistrict'] + ' ' + index['businessDistrict'],
            '学历要求': index['jobDegree'],
            '工作经验': index['jobExperience'],
            '职位名称': index['jobName'],
            '薪资待遇': salary,
            '薪资制度': salary_system,
            '沟通职员': index['bossTitle'] + '-' + index['bossName'],
            '所需技能': ' '.join(index['skills']),
            '公司福利': ' '.join(index['welfareList']),
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch or attach to the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the job-list data packet (change the URL fragment for your own target)
    google.listen.start(r"wapi/zpgeek/search/joblist.json")
    # Open the target search page (change the URL for your own query)
    google.get(r"https://www.zhipin.com/web/geek/jobs?city=101280100&query=%E9%A1%B9%E7%9B%AE%E5%8A%A9%E7%90%86")

    # Open the output CSV file
    f = open('boss_project_assistant.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司行业', '公司规模', '融资阶段', '工作区域', '学历要求',
                                       '工作经验', '职位名称', '薪资待遇', '薪资制度', '沟通职员', '所需技能',
                                       '公司福利'])
    cd.writeheader()

    num = 50
    for page in range(1, num + 1):
        print(f'Processing page {page}…')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                # Response body parsed into a dict
                data_response = data_load.response.body
                # Parse the rows and write them to the CSV
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page to trigger loading the next batch
                    google.scroll.to_bottom()
                    time.sleep(1)
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()

Sample output:

2. 51job (前程无忧) — click-based pagination

Python code:

# Import the browser automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['resultbody']['job']['items']
    # Loop over each job entry in the list
    for index in data_list:
        # Salary: split the range from the pay-months scheme
        salary_list = index['provideSalaryString'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # District (may be missing)
        district_string = '未知'
        if 'districtString' in index['jobAreaLevelDetail']:
            district_string = index['jobAreaLevelDetail']['districtString']
        # Company industry (merge the two industry fields when they differ)
        company_type = index['companyIndustryType1Str']
        if 'companyIndustryType2Str' in index and index['companyIndustryType2Str'] != index['companyIndustryType1Str']:
            company_type = index['companyIndustryType1Str'] + ';' + index['companyIndustryType2Str']
        # HR status fields (all optional)
        hr_labels, hr_active_status_green, hr_info = '未知', '未知', '未知'
        if 'hrLabels' in index and index['hrLabels'] != []:
            hr_labels = index['hrLabels'][0]
        if 'hrActiveStatusGreen' in index:
            hr_active_status_green = index['hrActiveStatusGreen']
        if 'hrPosition' in index and 'hrName' in index:
            hr_info = index['hrPosition'] + '-' + index['hrName']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['fullCompanyName'],
            '公司性质': index['companyTypeString'],
            '公司领域': company_type,
            '公司规模': index['companySizeString'],
            '职位名称': index['jobName'],
            '优先专业': index['major1Str'] + ' ' + index['major2Str'],
            '所在省份': index['jobAreaLevelDetail']['provinceString'],
            '所在城市': index['jobAreaLevelDetail']['cityString'],
            '所在地区': district_string,
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['termStr'],
            '所需学历': index['degreeString'],
            '所需经验': index['workYearString'],
            '沟通HR': hr_info,
            '处理速度': hr_labels,
            '在线时间': hr_active_status_green,
            '投递频率': index['applyTimeText'],
            '公司详情页': index['companyHref'],
            '其他标签': ','.join(index['jobTags'])
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch or attach to the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the search API packet
    google.listen.start(r"api/job/search-pc")
    # Open the target search page
    google.get(r"https://we.51job.com/pc/search?jobArea=260200&keyword=Python&searchType=2&keywordType=")

    # Open the output CSV file
    f = open('51job_artificial_intelligence.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模',
                                       '职位名称', '优先专业', '所在省份', '所在城市',
                                       '所在地区', '薪资范围', '薪资制度', '工作形式',
                                       '所需学历', '所需经验', '沟通HR', '处理速度',
                                       '在线时间', '投递频率', '公司详情页', '其他标签'])
    cd.writeheader()

    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Scroll to the bottom of the page
            google.scroll.to_bottom()
            # Locate the "next page" button and click it
            button = google.ele('css:.el-icon-arrow-right')
            button.run_js('this.click();')
            google.scroll.to_bottom()
            # Pause the listener and clear any packets already captured
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the "previous page" button and click it, so page 1 is requested again and captured
            button = google.ele('css:.el-icon-arrow-left')
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Response body parsed into a dict
                data_response = data_load.response.body
                # Parse the rows and write them to the CSV
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the "next page" button and click it
                    button = google.ele('css:.el-icon-arrow-right')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()

dp.ele() → locates an element through the element panel, where dp is the browser (page) object and ele is short for "element".
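
For reference, a few locator forms used throughout these scripts (the selectors are taken from the examples above and below and are only illustrative):

# DrissionPage locator forms used in this article
button = google.ele('css:.el-icon-arrow-right')    # 'css:' prefix → a standard CSS selector
button = google('下一页')                           # plain text → element containing that text
google.scroll.to_see(button)                        # scroll until the element is visible
button.run_js('this.click();')                      # click it via JavaScript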

Sample output:

3. Zhilian Zhaopin (智联招聘) — click-based pagination

Python code:

# Import the browser automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['list']
    # Loop over each job entry in the list
    for index in data_list:
        # Skills: concatenate the 'value' field of every skill label
        skill_result = ''
        for skill_dictionary in index['skillLabel']:
            for key, value in skill_dictionary.items():
                if key == 'value':
                    skill_result += value + ' '
        # Salary: split the range from the pay-months scheme
        salary_list = index['salary60'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Welfare: prefer the longer of the two tag lists
        welfare_str = ' '.join(index['welfareTagList'])
        if 'jobKnowledgeWelfareFeatures' in index and len(index['jobKnowledgeWelfareFeatures']) > len(
                index['welfareTagList']):
            welfare_str = ' '.join(index['jobKnowledgeWelfareFeatures'])
        # HR reply speed (optional field)
        hr_processing_speed = '未知'
        if 'hrStateInfo' in index and len(index['hrStateInfo']) > 0:
            hr_processing_speed = index['hrStateInfo']

        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['companyName'],
            '公司性质': index['property'],
            '公司领域': index['industryName'],
            '公司规模': index['companySize'],
            '职位名称': index['name'],
            '所在城市': index['workCity'],
            '所在地区': index['cityDistrict'],
            '所在街道': index['streetName'],
            '公司源址': index['jobRootOrgInfo']['cityName'],
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['workType'],
            '所需学历': index['education'],
            '所需经验': index['workingExp'],
            '所需技能': skill_result,
            '沟通HR': index['staffCard']['hrJob'] + '-' + index['staffCard']['staffName'],
            '处理速度': hr_processing_speed,
            '在线时间': index['staffCard']['hrOnlineState'],
            '公司详情页': index['companyUrl'],
            '职位详情页': index['positionUrl'],
            '其他福利': welfare_str
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch or attach to the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the positions API packet
    google.listen.start(r"c/i/search/positions")
    # Open the target search page
    google.get(r"https://www.zhaopin.com/sou/jl765/kw01800U80EG06G03F01N0/p2?kt=3")

    # Open the output CSV file
    f = open('zhaopin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模', '职位名称',
                                       '所在城市', '所在地区', '所在街道', '公司源址', '薪资范围',
                                       '薪资制度', '工作形式', '所需学历', '所需经验', '所需技能',
                                       '沟通HR', '处理速度', '在线时间', '公司详情页', '职位详情页', '其他福利'])
    cd.writeheader()

    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Pause the listener and clear any packets already captured
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the "previous page" button (the start URL is page 2, so this re-requests page 1)
            button = google.ele('css:.soupager a:first-of-type')
            # Scroll the button into view, then click it
            google.scroll.to_see(button)
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Response body parsed into a dict
                data_response = data_load.response.body
                # Parse the rows and write them to the CSV
                deal_with(data_response, cd)
                if page < num:
                    # Locate the "next page" button
                    button = google.ele('css:.soupager a:last-of-type')
                    # Scroll the button into view, then click it
                    google.scroll.to_see(button)
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()
  • css:.soupager a:first-of-type → css:.soupager selects the element whose class is soupager; a:first-of-type picks its first <a> tag.
  • css:.soupager a:last-of-type → css:.soupager selects the element whose class is soupager; a:last-of-type picks its last <a> tag.
  • a:nth-of-type(even) picks the <a> tags at even positions; a:nth-of-type(odd) picks the ones at odd positions (see the sketch below).
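
A quick illustration of these selectors with DrissionPage (the .soupager class comes from the Zhaopin pager above; eles() is assumed here for multi-element lookup, returning every match instead of just the first):

# structural pseudo-class selectors on the Zhaopin pager
prev_btn = google.ele('css:.soupager a:first-of-type')           # first <a> inside .soupager
next_btn = google.ele('css:.soupager a:last-of-type')            # last <a> inside .soupager
even_links = google.eles('css:.soupager a:nth-of-type(even)')    # <a> tags at even positions
odd_links = google.eles('css:.soupager a:nth-of-type(odd)')      # <a> tags at odd positions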

Sample output:

4. Liepin (猎聘网) — click-based pagination

Python code:

# Import the browser automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['soJobForms']
    # Loop over each job entry in the list
    for index in data_list:
        # Salary: split the range from the pay-months scheme
        salary_list = index['salary'].split('·')
        salary_value = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Skills and welfare: job labels minus selling points count as skills, selling points as welfare
        skill_value, welfare_value = '', ''
        if 'jobLabels' in index and 'sellingPointList' in index:
            skill_list = [item for item in index['jobLabels'] if item not in index['sellingPointList']]
            if len(skill_list) >= 1:
                skill_value = ';'.join(skill_list)
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        elif 'jobLabels' in index:
            if len(index['jobLabels']) >= 1:
                skill_value = ';'.join(index['jobLabels'])
        elif 'sellingPointList' in index:
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        # Contact HR (the title is optional)
        hr = index['recruiterName']
        if 'recruiterTitle' in index:
            hr = index['recruiterTitle'] + '-' + index['recruiterName']
        # Company scale (optional field)
        scale_value = '未知'
        if 'compScale' in index:
            scale_value = index['compScale']

        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['company'],
            '公司领域': index['industry'],
            '公司规模': scale_value,
            '职位名称': index['title'],
            '所在地址': index['dq'],
            '薪资范围': salary_value,
            '薪资制度': salary_system,
            '所需学历': index['requireEduLevel'],
            '所需技能': skill_value,
            '所需经验': index['requireWorkYears'],
            '沟通HR': hr,
            '发布时间': index['date'],
            '公司福利': welfare_value
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch or attach to the browser (instantiate the browser object)
    google = ChromiumPage()
    # Start listening for the search API packet
    google.listen.start(r"api/com.liepin.searchfront4c.h5-search-job")
    # Open the target search page
    google.get(r"https://m.liepin.com/zhaopin/?dqs=170020&keyword=Python")

    # Open the output CSV file
    f = open('liepin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司领域', '公司规模', '职位名称', '所在地址',
                                       '薪资范围', '薪资制度', '所需学历', '所需技能', '所需经验',
                                       '沟通HR', '发布时间', '公司福利'])
    cd.writeheader()

    num = 10
    for page in range(1, num + 1):
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Response body parsed into a dict
                data_response = data_load.response.body
                # Parse the rows and write them to the CSV
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the "下一页" (next page) button by its text and click it
                    button = google('下一页')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()

Sample output:

II. Web Scraping — the Requests Module

1. Dewu (得物) — reverse-engineered JS signing

js_file.js module code: the JS module used when scraping Dewu
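
The script below uses PyExecJS to run the site's signing code locally: compile the JS source once, then call its exported function with the request payload. A minimal sketch of that pattern (it assumes js_file.js defines the function c(params) that the full script calls):

# execjs pattern: compile the JS once, then call the signing function
import execjs

with open('./js_file.js', encoding='utf-8') as fp:
    ctx = execjs.compile(fp.read())      # compile the JS source into a callable context
sign = ctx.call('c', {'pageNum': 1})     # call c() with a (placeholder) payload
print(sign)                              # hex string used as the 'sign' request parameter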

Python code:

import requests
import pandas as pd
# Module for compiling and running JS code (PyExecJS)
import execjs
# ------------------------------------------------
import openpyxl
from openpyxl.drawing.image import Image as xlImage
from openpyxl.utils import get_column_letter
from PIL import Image
from io import BytesIO


def get_data_xlsx(js_path, save_path):
    # Request headers
    request_header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'content-length': '124',
        'content-type': 'application/json',
        'cookie': '...',
        'host': 'app.dewu.com',
        'ltk': '...',
        'origin': 'https://www.dewu.com',
        'referer': 'https://www.dewu.com/',
        'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'sessionid': '...',
        'shumeiid': '...',
        'sk': '',
        'traceparent': '...',
        'user-agent': '...'
    }
    # Request URL
    request_url = r'https://app.dewu.com/api/v1/h5/commodity-pick-interfaces/pc/pick-rule-result/feeds/info'
    # Request payload
    request_parameters = {
        'filterUnbid': True,
        'pageNum': 1,  # page number
        'pageSize': 24,
        'pickRuleId': 644443,  # category ID
        'showCspu': True
    }

    # Compile the JS code
    js_code = execjs.compile(open(js_path, encoding='utf-8').read())
    # Compute the sign parameter
    sign_data = js_code.call('c', request_parameters)
    # e.g. 0e5d10fb111f2afef6ac0a1776187e23
    # Add the sign to the request payload
    request_parameters['sign'] = sign_data

    print('Data is being requested and processed…')
    # Request the data
    response = requests.post(url=request_url, json=request_parameters, headers=request_header)
    # Parse the JSON response
    data_json = response.json()
    # Create an empty list for the rows
    dewu_info = []
    # Extract the product list
    info_list = data_json['data']['list']
    for index in info_list:
        info_dict = {
            '标题': index['title'],
            '价格': index['price'] / 100,
            '图片网址': index['logoUrl']
        }
        # Append the row
        dewu_info.append(info_dict)
    # Convert to a DataFrame
    df = pd.DataFrame(dewu_info)
    # Export and save as an Excel workbook
    df.to_excel(save_path, index=False)
    print(f'The data is already saved in {save_path}')


def download_image(url):
    rg_url = requests.get(url)
    # Check the response status code
    if rg_url.status_code == 200:
        # Build an image object from the downloaded bytes
        image = Image.open(BytesIO(rg_url.content))
        # Normalise the colour mode
        if image.mode != 'RGB':
            image = image.convert('RGB')
        # Resize to fit the spreadsheet cell
        return image.resize((150, 96))
    else:
        # Raise OSError so the caller's except OSError clause can skip this image
        raise OSError(f"Unable to download image, status code: {rg_url.status_code}")


def link_to_png(source_path, destination_path):
    # Load the Excel workbook
    wb = openpyxl.load_workbook(source_path)
    # Defaults to the first sheet
    sheet = wb.active
    # Adjust the row heights and the image-column width
    for row in range(2, sheet.max_row + 1):
        sheet.row_dimensions[row].height = 75
    sheet.column_dimensions['C'].width = 20

    # Read each link, download the image and insert it at the matching cell
    for row in range(2, sheet.max_row + 1):
        # Image links start at row 2; column C (column = 3) holds the link
        link = sheet.cell(row=row, column=3).value
        # Clear the cell content
        sheet.cell(row=row, column=3).value = None
        # Only process non-empty links
        if link:
            # Download the image over HTTP
            try:
                # Try to download the image
                resized_image = download_image(link)
            except OSError:
                print(f"Failed to download image {link}")
                continue
            else:
                # Insert the resized image into the worksheet
                img_bytes = BytesIO()
                resized_image.save(img_bytes, format='PNG')  # keep the PNG in memory
                img = xlImage(img_bytes)
                sheet.add_image(img, f'{get_column_letter(3)}{row}')  # anchor the image at column C of this row

    wb.save(destination_path)  # required: write the workbook to disk
    wb.close()  # required: release the file handle


if __name__ == '__main__':
    j_path = './js_file.js'
    s_path = './dewu_link.xlsx'
    # Fetch the data and save it as an Excel file
    get_data_xlsx(j_path, s_path)
    d_path = './dewu_png.xlsx'
    print('Excel file is being processed…')
    link_to_png(s_path, d_path)
    print(f'The data is already saved in {d_path}')

Sample output:

2. Xianyu / Goofish (闲鱼) — MD5-signed requests
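
The Goofish mtop API expects a sign query parameter: an MD5 hex digest over token & timestamp & appKey & payload, which is what get_sign() below builds. The hashing step in isolation (the token and payload here are placeholder values):

# the MD5 signing step on its own (placeholder values)
import hashlib
import time

token = '...'                          # time-limited token (the d_token in get_sign below)
t = int(time.time() * 1000)            # millisecond timestamp
app_key = '34839810'                   # appKey, also sent in the query string
payload = '{"pageNumber": 1}'          # placeholder request body
raw = f'{token}&{t}&{app_key}&{payload}'
sign = hashlib.md5(raw.encode('utf-8')).hexdigest()
print(sign)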

Python code:

# Import the HTTP request module
import requests
import csv
# Import the hashing module
import hashlib
import time


def get_sign(page):
    d_token = '...'  # d_token expires over time; fill in your own value
    j = int(time.time() * 1000)
    h = '34839810'
    c_data = ('{"pageNumber": %d, '
              '"keyword": "python爬虫书籍", '
              '"fromFilter": false, '
              '"rowsPerPage": 30, '
              '"sortValue": "", '
              '"sortField": "", '
              '"customDistance": "", '
              '"gps": "", '
              '"propValueStr": {}, '
              '"customGps": "", '
              '"searchReqFromPage": "pcSearch", '
              '"extraFilterValue": "{}", '
              '"userPositionJson": "{}"}') % page
    result_str = d_token + "&" + str(j) + "&" + h + "&" + c_data
    # Create an MD5 hash object
    md_str = hashlib.md5()
    # Feed in the string to be signed
    md_str.update(result_str.encode('utf-8'))
    # The hex digest is the sign parameter
    sign = md_str.hexdigest()
    return sign, j, c_data


def get_data_csv(file_path, head_name):
    # Imitate a browser (request headers)
    request_header = {
        'Referer': 'https://www.goofish.com/',
        # The cookie carries user/session info and is used to check whether an account is logged in
        # (a cookie is present whether or not you are logged in); it expires, so fill in your own
        'Cookie': '...',
        # The user-agent identifies the browser / device
        'User-Agent': '...'
    }
    # Request URL
    request_url = r'https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/'
    # Open the output CSV file
    f = open(file_path, mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=head_name)
    cd.writeheader()

    # Loop over the result pages
    num = 10
    for i in range(1, num + 1):
        print(f'Collecting page {i}…')
        # Get the sign parameter, the timestamp and the form payload
        sign, j_time, c_data = get_sign(i)
        # Query-string parameters
        query_parameters = {
            'jsv': '2.7.2',
            'appKey': '34839810',
            't': str(j_time),
            'sign': sign,
            'v': '1.0',
            'type': 'originaljson',
            'accountSite': 'xianyu',
            'dataType': 'json',
            'timeout': '20000',
            'api': 'mtop.taobao.idlemtopsearch.pc.search',
            'sessionOption': 'AutoLoginOnly',
            'spm_cnt': 'a21ybx.search.0.0',
            'spm_pre': 'a21ybx.home.searchSuggest.1.4c053da6IXTxSx',
            'log_id': '4c053da6IXTxSx'
        }
        # Form data
        form_data = {"data": c_data}
        # Send the request
        response = requests.post(url=request_url, params=query_parameters, data=form_data, headers=request_header)

        # Parse the JSON response into a dict
        data_json = response.json()
        # Extract the list of product entries
        info_list = data_json['data']['resultList']
        # Loop over each product entry in the list
        for index in info_list:
            # Seller nickname (optional field)
            nick_name = '未知'
            if 'userNickName' in index['data']['item']['main']['exContent']:
                nick_name = index['data']['item']['main']['exContent']['userNickName']
            # Price: concatenate the text pieces
            price_list = index['data']['item']['main']['exContent']['price']
            price = ''
            for p in price_list:
                price += p['text']
            # Build the item detail-page link
            item_id = index['data']['item']['main']['exContent']['itemId']
            link = f'https://www.goofish.com/item?id={item_id}'

            temporarily_dict = {
                '标题': index['data']['item']['main']['exContent']['title'],
                '地区': index['data']['item']['main']['exContent']['area'],
                '售价': price,
                '用户名': nick_name,
                '详情页链接': link
            }
            cd.writerow(temporarily_dict)
    f.close()


if __name__ == '__main__':
    f_path = './fish_python.csv'
    h_name = ['标题', '地区', '售价', '用户名', '详情页链接']
    get_data_csv(f_path, h_name)

Sample output:

