本示例使用 Puppeteer 驱动浏览器自动访问百度图片搜索,并下载搜索结果中的图片。代码分为两部分:
- 自动化浏览器任务:使用 Puppeteer 浏览百度图片搜索并获取图片 URL。
- 图片下载:判断图片地址的类型(base64 data URL 或 http/https 链接),并将图片保存到本地。
如果通过 npm 安装 Puppeteer 时失败(通常是浏览器下载失败):
请先确认 Puppeteer 的下载地址已经指向 npmmirror(原淘宝)镜像,然后重新安装。Windows 命令行(cmd)下的设置方式:
set PUPPETEER_DOWNLOAD_HOST=https://npmmirror.com/mirrors
示例:
import puppeteer from 'puppeteer';
import http from "http";
import https from "https";
import fs from "fs";
import {promisify} from 'util';
import qs from "querystring";
import {v4} from "uuid";
/**
 * Script entry point.
 *
 * Launches a visible Chrome instance, opens Baidu image search, submits the
 * search keyword, collects the `src` of every `.main_img` element on the
 * result page, downloads them into `./image/p1/`, takes a screenshot and
 * finally closes the browser (also when any step fails).
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false, // show the browser window
    browser: "chrome",
    slowMo: 250, // slow down by 250ms
    // executablePath: '', // path to a different browser executable, if needed
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://image.baidu.com/');
    console.log('goto: https://image.baidu.com/');
    await page.setViewport({width: 1920, height: 1080});

    // Focus the search input; reload and retry while it is not there yet.
    const maxAttempts = 60;
    let focused = false;
    for (let attempt = 0; attempt < maxAttempts && !focused; attempt++) {
      try {
        await page.focus('[name="word"]');
        focused = true;
      } catch (err) {
        await page.reload();
        console.log('[name="word"] selector not found, try again');
      }
    }
    if (!focused) {
      // Typing into an unfocused page would silently do nothing, so stop here.
      throw new Error(`[name="word"] selector not found after ${maxAttempts} attempts`);
    }

    // Type the search keyword into the focused input.
    // (page.type(selector, text) would combine the focus and typing steps.)
    await page.keyboard.sendCharacter('卡皮巴拉');

    // Start waiting for the navigation BEFORE clicking: a listener attached
    // after the click can miss a 'load' event that has already fired.
    await Promise.all([
      page.waitForNavigation({waitUntil: 'load'}),
      page.click('.submit-btn_ZmEXZ'),
    ]);
    console.log('page loaded!');

    await page.waitForSelector('.main_img');
    // The callback passed to evaluate runs inside the browser page.
    const sources = await page.evaluate(() => {
      const images = document.querySelectorAll('.main_img');
      // This log shows up in the console of the Puppeteer-controlled browser.
      console.log(images);
      // NodeList has no map(); Array.from converts and maps in one step.
      return Array.from(images, img => img.src);
    });

    // allSettled: one failed download must not abort the remaining ones.
    const results = await Promise.allSettled(sources.map(imageSrc => {
      console.log(imageSrc);
      return downloadImage(imageSrc, './image/p1/');
    }));
    for (const result of results) {
      if (result.status === 'rejected') {
        console.log('[error] download fail!', result.reason);
      }
    }

    await page.screenshot({
      path: './screenshot.png',
    });
  } finally {
    // Always release the browser process, even when a step above threw.
    await browser.close();
  }
})().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Derive a file extension for an image URL (private helper of urlToImage).
 *
 * Tries the `f` query parameter first (e.g. `...&f=JPEG?w=500`), then the
 * extension of the URL path, and finally falls back to `jpg`. Only plain
 * alphanumeric extensions are accepted because the value ends up in a file
 * path and comes from an untrusted page.
 *
 * @param {string} url - Image URL.
 * @returns {string} Lower-case extension without the leading dot.
 */
const imageExtFromUrl = url => {
  const normalise = raw => {
    const ext = String(raw).split('?').shift().toLowerCase().replace('jpeg', 'jpg');
    return /^[a-z0-9]+$/.test(ext) ? ext : '';
  };

  // querystring.parse returns an array when a key is repeated.
  const fParam = qs.parse(url).f;
  const firstValue = Array.isArray(fParam) ? fParam[0] : fParam;
  const fromQuery = normalise(firstValue ?? '');
  if (fromQuery) {
    return fromQuery;
  }

  try {
    const match = /\.([a-z0-9]+)$/i.exec(new URL(url).pathname);
    if (match) {
      return normalise(match[1]);
    }
  } catch {
    // Not a parsable URL; the request itself will report the real problem.
  }
  return 'jpg';
};

/**
 * Download an image over http/https and save it as `${dir}<uuid>.<ext>`.
 *
 * @param {string} url - http(s) URL of the image.
 * @param {string} dir - Path prefix the file name is appended to (expected
 *   to end with a path separator).
 * @param {Function} [callback] - Unused; kept so existing call signatures
 *   remain valid.
 * @returns {Promise<void>} Resolves once the file is completely written.
 *   Rejects on request/response/write errors and on non-2xx responses.
 */
const urlToImage = (url, dir, callback) => {
  const mod = /^https:/.test(url) ? https : http;
  const file = `${dir}${v4()}.${imageExtFromUrl(url)}`;

  return new Promise((resolve, reject) => {
    const request = mod.get(url, res => {
      const {statusCode} = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, then report the failure
        // instead of saving an error page as an image.
        res.resume();
        reject(new Error(`download failed with HTTP ${statusCode}: ${url}`));
        return;
      }

      // Stream the response body into the target file.
      const writeStream = fs.createWriteStream(file);
      const fail = err => {
        res.destroy();
        writeStream.destroy();
        // Best-effort removal of a partially written file.
        fs.unlink(file, () => {});
        reject(err);
      };
      res.on('error', fail);
      writeStream.on('error', fail);
      writeStream.on('finish', () => {
        console.log(file);
        resolve();
      });
      res.pipe(writeStream);
    });
    // Without this listener a DNS/connection error is an uncaught exception.
    request.on('error', reject);
  });
};
/**
 * Decode a base64 data URL and save it as `${dir}<uuid>.<ext>`.
 *
 * The extension is taken from the MIME subtype: parameters (`;charset=...`)
 * and structured-syntax suffixes (`+xml`) are dropped, and `jpeg` becomes
 * `jpg`. Failures are logged and not re-thrown, so the returned promise
 * always resolves.
 *
 * @param {string} base64 - Data URL of the form `data:<mime>;base64,<payload>`.
 * @param {string} dir - Path prefix the file name is appended to (expected
 *   to end with a path separator).
 * @returns {Promise<void>}
 */
const base64ToImage = async (base64, dir) => {
  try {
    const matches = /^data:(.+?);base64,(.+)$/.exec(base64);
    if (!matches) {
      throw new Error('base64ToImage: input is not a base64 data URL');
    }

    const [, mediaType, payload] = matches;
    const [mimeType] = mediaType.split(';'); // drop parameters such as charset
    const subtype = mimeType.split('/')[1];
    if (!subtype) {
      throw new Error(`base64ToImage: cannot derive a file extension from "${mimeType}"`);
    }
    // "svg+xml" -> "svg", "JPEG" -> "jpg"
    const ext = subtype.split('+')[0].toLowerCase().replace('jpeg', 'jpg');

    const file = `${dir}${v4()}.${ext}`;
    await fs.promises.writeFile(file, payload, 'base64');
  } catch (err) {
    console.log(err);
  }
};
/**
 * Save one image source to disk, choosing the strategy by source type.
 *
 * - `data:<mime>;base64,<payload>` sources are decoded via base64ToImage.
 * - `http://` / `https://` sources are fetched via urlToImage.
 * - Anything else is logged as unsupported.
 *
 * The directory part of `dir` is created when missing. Download failures are
 * logged and not re-thrown, so one bad image does not abort a batch.
 *
 * @param {string} src - Image source taken from an `<img>` element.
 * @param {string} dir - Path prefix for the saved file (expected to end with
 *   a path separator).
 * @returns {Promise<void>}
 */
async function downloadImage(src, dir) {
  // Anchored on purpose: an http URL may carry "data:...;base64," in its query.
  const isDataUrl = typeof src === 'string' && /^data:(.+?);base64,(.+)/.test(src);
  const isHttpUrl = typeof src === 'string' && /^https?:\/\//i.test(src);

  if (!isDataUrl && !isHttpUrl) {
    console.log('[error] download fail, unsupported image type!', src);
    return;
  }

  try {
    // `dir` is used as a string prefix; create only its directory part.
    const targetDir = String(dir).replace(/[^\\/]*$/, '');
    if (targetDir) {
      await fs.promises.mkdir(targetDir, {recursive: true});
    }

    if (isDataUrl) {
      await base64ToImage(src, dir);
    } else {
      await urlToImage(src, dir);
    }
  } catch (err) {
    console.log('[error] download fail!', src, err);
  }
}