使用 Puppeteer 快速上手 Node.js 爬虫

发布于:2024-12-18 ⋅ 阅读:(40) ⋅ 点赞:(0)

本文使用 Puppeteer 库驱动浏览器自动访问百度图片搜索,并下载搜索结果中的图片。代码分为两部分:

  1. 自动化浏览器任务:使用 Puppeteer 浏览百度图片搜索并获取图片 URL。
  2. 图片下载:判断图片地址是 base64 数据还是普通 URL,并将图片保存到本地。

如果无法通过npm安装:

安装前先通过环境变量把 Puppeteer 的浏览器下载地址指向 npmmirror(原淘宝)镜像,下面是 Windows 命令行的写法:
set PUPPETEER_DOWNLOAD_HOST=https://npmmirror.com/mirrors 

示例: 

import puppeteer from 'puppeteer';
import http from "http";
import https from "https";
import fs from "fs";
import {promisify} from 'util';
import qs from "querystring";
import {v4} from "uuid";


/**
 * Script entry point: opens Baidu image search in a Puppeteer-driven browser,
 * searches for a keyword, downloads every `.main_img` result into
 * `./image/p1/`, saves a screenshot and closes the browser.
 *
 * The browser is closed in a `finally` block so a failure at any step does
 * not leave a browser process behind; failures are reported on stderr and
 * turn the process exit code to 1.
 */
(async () => {
    const outputDir = './image/p1/';

    const browser = await puppeteer.launch({
        headless: false, // show the browser window
        browser: "chrome",
        slowMo: 250, // slow down by 250ms
        // executablePath: set this to launch a different browser binary
    });

    try {
        const page = await browser.newPage();

        await page.goto('https://image.baidu.com/');
        console.log('goto: https://image.baidu.com/');

        await page.setViewport({width: 1920, height: 1080});

        // Focus the search box, reloading the page when it is not there yet.
        let focused = false;
        let count = 0;
        while (count < 60 && !focused) {
            try {
                await page.focus('[name="word"]');
                focused = true;
            } catch (err) {
                count++;
                await page.reload();
                console.log('[name="word"] selector not found, try again');
            }
        }
        if (!focused) {
            // Without this guard the keyword would be typed into nothing.
            throw new Error('[name="word"] selector not found, giving up');
        }

        // Type the keyword into the focused input. A single page.type() call
        // on the input would be equivalent to the focus + sendCharacter pair.
        await page.keyboard.sendCharacter('卡皮巴拉');

        // Subscribe to `load` BEFORE clicking so the event cannot be missed,
        // and use `once` so later loads do not re-run the download step.
        const loaded = new Promise(resolve => page.once('load', resolve));
        await page.click('.submit-btn_ZmEXZ');
        await loaded;
        console.log('page loaded!');

        await page.waitForSelector('.main_img');

        // The callback passed to evaluate() runs inside the browser page.
        const sources = await page.evaluate(() => {
            const images = document.querySelectorAll('.main_img');
            // This output appears in the console of the Puppeteer-driven browser.
            console.log(images);
            // querySelectorAll returns a NodeList, which has no map() of its own.
            return Array.prototype.map.call(images, img => img.src);
        });

        // The download helpers write straight into this directory.
        fs.mkdirSync(outputDir, {recursive: true});

        // allSettled: one broken image must not abort the remaining downloads.
        const results = await Promise.allSettled(sources.map(src => {
            console.log(src);
            return downloadImage(src, outputDir);
        }));
        for (const result of results) {
            if (result.status === 'rejected') {
                console.log('[error] download fail!', result.reason);
            }
        }

        await page.screenshot({
            path: './screenshot.png',
        });
    } finally {
        await browser.close();
    }
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});

/**
 * Downloads the image at `url` over HTTP(S) and streams it into a new,
 * randomly named file inside `dir`.
 *
 * The file extension is taken from the `f` parameter found in the URL string
 * (values shaped like `JPEG?w=500` are handled), falling back to the
 * extension of the URL path.
 *
 * @param {string} url - Absolute http:// or https:// image URL.
 * @param {string} dir - Destination directory prefix; the file name is
 *   appended directly, so it must end with a path separator.
 * @param {Function} [callback] - Not used; kept so existing call signatures
 *   keep working.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects when the image type cannot be determined, the server answers
 *   with a non-2xx status, or the request / response / file stream fails.
 */
const urlToImage = (url, dir, callback) => {
    return new Promise((resolve, reject) => {
        const mod = /^https:/.test(url) ? https : http;

        // The whole URL string is parsed on purpose: the `f` parameter may sit
        // before the '?' and carry a trailing "?w=..&h=.." that must be cut off.
        const format = qs.parse(url).f;
        const formatValue = Array.isArray(format) ? format[0] : format;
        const fromQuery = formatValue?.split('?').shift();
        const fromPath = /\.([a-z0-9]+)$/i.exec(new URL(url).pathname)?.[1];
        const ext = (fromQuery || fromPath || '').toLowerCase().replace('jpeg', 'jpg');

        // The URL comes from a web page: only allow a plain alphanumeric
        // extension so it can never inject path segments into the file name.
        if (!/^[a-z0-9]+$/.test(ext)) {
            reject(new Error(`download fail, cannot determine image type: ${url}`));
            return;
        }

        const file = `${dir}${v4()}.${ext}`;

        const request = mod.get(url, res => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                res.resume(); // drain the response so the socket is released
                reject(new Error(`download fail, unexpected status ${res.statusCode}: ${url}`));
                return;
            }

            // Stream the response body straight into the target file.
            const writeStream = fs.createWriteStream(file);
            res.pipe(writeStream);

            res.on('error', err => {
                writeStream.destroy();
                reject(err);
            });
            writeStream.on('error', err => {
                res.destroy();
                reject(err);
            });

            // 'finish' fires after all data has been flushed to the file.
            writeStream.on('finish', () => {
                console.log(file);
                resolve();
            });
        });

        // Connection-level failures (DNS, refused, reset) surface here.
        request.on('error', reject);
    });
};

/**
 * Decodes a base64 data URI (`data:<mime>;base64,<payload>`) and writes it
 * to a new, randomly named file inside `dir`.
 *
 * This is best effort: every failure is logged and the returned promise
 * still resolves, so one bad image does not fail the caller.
 *
 * @param {string} base64 - The complete data URI.
 * @param {string} dir - Destination directory prefix; the file name is
 *   appended directly, so it must end with a path separator.
 * @returns {Promise<void>}
 */
const base64ToImage = async (base64, dir) => {
    try {
        const matches = /^data:(.+?);base64,(.+)$/.exec(base64);
        if (matches === null) {
            console.log('[error] save fail, not a base64 data URI!');
            return;
        }

        const [, mimeType, payload] = matches;
        // Reduce the MIME type to a bare extension:
        // "image/jpeg;charset=utf-8" -> "jpg", "image/svg+xml" -> "svg".
        const subtype = mimeType.split(';')[0].split('/')[1] ?? '';
        const ext = subtype.split('+')[0].toLowerCase().replace('jpeg', 'jpg');
        if (!/^[a-z0-9.-]+$/.test(ext)) {
            console.log('[error] save fail, unsupported mime type!', mimeType);
            return;
        }

        const file = `${dir}${v4()}.${ext}`;

        await fs.promises.writeFile(file, payload, 'base64');
    } catch (err) {
        console.log(err);
    }
}

/**
 * Saves one image source into `dir`, picking the strategy from its form:
 * base64 data URIs are decoded locally, sources starting with "http" are
 * fetched over the network, and anything else is only reported on the console.
 *
 * @param {string} src - Image source taken from an <img> element.
 * @param {string} dir - Destination directory prefix handed to the savers.
 * @returns {Promise<void>}
 */
async function downloadImage(src, dir) {
    const isDataUri = /data:(.+?);base64,(.+)/.test(src);
    if (isDataUri) {
        await base64ToImage(src, dir);
        return;
    }

    if (!src.startsWith('http')) {
        console.log('[error] download fail, unsupported image type!', src);
        return;
    }

    await urlToImage(src, dir);
}


网站公告

今日签到

点亮在社区的每一天
去签到