使用 Go 语言调用 Chrome 浏览器对网页进行截图,对电脑性能要求较高,因此速度比较有限;但就目前来看,这种方式能够最好地获取网页加载完成后的实际渲染结果。
main.go
package main
import (
"context"
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/chromedp/chromedp"
)
// Task describes a single screenshot job handed to a worker.
type Task struct {
	// URL is the address of the page to capture.
	URL string
	// Filename is the path the resulting screenshot is written to.
	Filename string
}
var (
	// blacklist lists keywords for domains that must be skipped; readURLs
	// drops every URL whose text contains one of them.
	blacklist = []string{"edu.cn", "gov.cn"}

	// totalTasks is the number of tasks queued for the current run.
	totalTasks int64
	// finishedTasks counts tasks that have been fully processed. Workers
	// update it with the sync/atomic functions.
	finishedTasks int64
)
// main parses the command-line flags, loads the URL list, queues one Task per
// URL and processes the queue with a pool of workers while a background
// goroutine prints progress once per second.
func main() {
	start := time.Now()
	defer func() {
		if r := recover(); r != nil {
			log.Printf("程序异常退出: %v", r)
		}
	}()

	// Command-line flags.
	urlFile := flag.String("urls", "urls.txt", "包含URL列表的文件路径")
	outputDir := flag.String("output", "screenshots", "截图保存的目录")
	workers := flag.Int("workers", 50, "并发工作线程数(建议1~3)")
	width := flag.Int("width", 1280, "浏览器窗口宽度")
	height := flag.Int("height", 800, "浏览器窗口高度")
	fullPage := flag.Bool("full", false, "是否截取整个页面")
	// The default used to be 20 while the help text advertised 120; the guard
	// below then always rewrote it to 60 and printed a warning. The default
	// and the help text now state the value that was effectively in use.
	timeout := flag.Int("timeout", 60, "每个任务的超时时间(秒,建议大于页面加载等待时间;不大于30时自动调整为60)")
	retry := flag.Int("retry", 3, "失败重试次数")
	initialWait := flag.Int("initialWait", 1, "初始等待时间(秒),用于分散任务启动")
	flag.Parse()

	// Keep the per-task timeout at a sane minimum.
	if *timeout <= 30 {
		log.Printf("警告:timeout参数过小,已自动调整为60秒以避免context canceled错误!")
		*timeout = 60
	}
	// With fewer than one worker nothing would ever be processed, yet the
	// program would still report success.
	if *workers < 1 {
		log.Printf("警告:workers参数必须至少为1,已自动调整为1")
		*workers = 1
	}

	// MkdirAll is a no-op when the directory already exists, so no Stat
	// check is needed first.
	if err := os.MkdirAll(*outputDir, 0755); err != nil {
		log.Fatalf("创建输出目录失败: %v", err)
	}

	urls, err := readURLs(*urlFile)
	if err != nil {
		log.Fatalf("读取URL文件失败: %v", err)
	}
	if len(urls) == 0 {
		log.Fatal("URL列表为空")
	}

	total := int64(len(urls))
	atomic.StoreInt64(&totalTasks, total)

	// The channel is buffered for every task, so it can be filled and closed
	// up front without blocking; workers simply drain it.
	taskCh := make(chan Task, len(urls))
	for _, u := range urls {
		taskCh <- Task{URL: u, Filename: generateFilename(u, *outputDir)}
	}
	close(taskCh)

	// report prints one progress line and reports whether all tasks are done.
	report := func(begin time.Time) bool {
		done := atomic.LoadInt64(&finishedTasks)
		elapsed := time.Since(begin).Seconds()
		var speed, remain float64
		if elapsed > 0 {
			speed = float64(done) / elapsed
		}
		if speed > 0 {
			remain = float64(total-done) / speed
		}
		percent := float64(done) / float64(total) * 100
		fmt.Printf("\r进度: %d/%d (%.2f%%) | 速度: %.2f/秒 | 已用: %.0fs | 预计剩余: %.0fs",
			done, total, percent, speed, elapsed, remain)
		return done >= total
	}

	// Progress monitor. stopProgress asks it to print a final line and exit;
	// progressDone lets main wait for that line before returning.
	stopProgress := make(chan struct{})
	progressDone := make(chan struct{})
	go func() {
		defer close(progressDone)
		begin := time.Now()
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			if report(begin) {
				fmt.Println()
				return
			}
			select {
			case <-ticker.C:
			case <-stopProgress:
				report(begin)
				fmt.Println()
				return
			}
		}
	}()

	// Never start more workers than there are tasks.
	workerCount := *workers
	if workerCount > len(urls) {
		workerCount = len(urls)
	}
	log.Printf("开始处理 %d 个URL,使用 %d 个工作线程\n", len(urls), workerCount)

	var wg sync.WaitGroup
	for i := 0; i < workerCount; i++ {
		if i > 0 {
			// Stagger start-up by a fixed interval. Sleeping i*initialWait
			// here would accumulate quadratically across the loop.
			time.Sleep(time.Duration(*initialWait) * time.Second)
			// The queue was filled and closed above, so an empty channel
			// means the running workers already picked up everything.
			if len(taskCh) == 0 {
				break
			}
		}
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			processTasks(workerID, taskCh, *width, *height, *fullPage, *timeout, *retry)
		}(i)
	}

	wg.Wait()
	close(stopProgress)
	<-progressDone

	log.Printf("所有任务完成,耗时: %s\n", time.Since(start))
}
// readURLs loads the file at filePath and returns one URL per non-blank line.
// Lines are trimmed of surrounding whitespace, entries without an "http://" or
// "https://" prefix get "https://" prepended, and any entry containing a
// blacklist keyword is dropped. A read failure is returned unchanged.
func readURLs(filePath string) ([]string, error) {
	raw, err := ioutil.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	// isBlacklisted reports whether candidate contains any blacklist keyword.
	isBlacklisted := func(candidate string) bool {
		for _, kw := range blacklist {
			if strings.Contains(candidate, kw) {
				return true
			}
		}
		return false
	}

	var result []string
	for _, line := range strings.Split(string(raw), "\n") {
		entry := strings.TrimSpace(line)
		if entry == "" {
			continue
		}
		hasScheme := strings.HasPrefix(entry, "http://") || strings.HasPrefix(entry, "https://")
		if !hasScheme {
			entry = "https://" + entry
		}
		if isBlacklisted(entry) {
			continue
		}
		result = append(result, entry)
	}
	return result, nil
}
// generateFilename derives the screenshot path for urlStr inside outputDir.
//
// The name is built from the URL's host and path. Path separators, characters
// that are reserved in file names on common file systems (\ : * ? " < > |) and
// control characters are replaced with "_". The name is limited to 100 bytes,
// cut on a rune boundary so multi-byte characters are never split, and ".png"
// is appended. If the URL cannot be parsed, or has neither host nor path, a
// timestamp-based "unknown_<nanoseconds>.png" name is returned instead.
func generateFilename(urlStr, outputDir string) string {
	const maxNameLen = 100

	fallback := func() string {
		return filepath.Join(outputDir, fmt.Sprintf("unknown_%d.png", time.Now().UnixNano()))
	}

	u, err := url.Parse(urlStr)
	if err != nil {
		return fallback()
	}

	name := strings.Map(func(r rune) rune {
		switch r {
		case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
			return '_'
		}
		if r < 0x20 || r == 0x7f {
			return '_'
		}
		return r
	}, u.Host+u.Path)

	// Without a host or path the result would be a bare ".png".
	if name == "" {
		return fallback()
	}

	if len(name) > maxNameLen {
		// Ranging over a string yields the byte offset of each rune start;
		// the largest offset not beyond the limit is a safe cut point.
		cut := 0
		for i := range name {
			if i > maxNameLen {
				break
			}
			cut = i
		}
		name = name[:cut]
	}

	return filepath.Join(outputDir, name+".png")
}
// processTasks is the body of one worker. It drains taskCh, capturing a
// screenshot for every task with up to retry attempts, increments
// finishedTasks once per task, and appends the URL of every task that never
// succeeded to failed_urls.txt.
//
// width and height set the browser window size; fullPage and timeout (in
// seconds) are passed through to captureScreenshot. A retry value below 1 is
// treated as 1 so that every task is attempted at least once.
func processTasks(workerID int, taskCh <-chan Task, width, height int, fullPage bool, timeout, retry int) {
	if retry < 1 {
		retry = 1
	}

	// Chrome launch options, including several anti-detection settings.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("headless", false),
		chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("disable-web-security", true),            // disable the web security policy
		chromedp.Flag("allow-running-insecure-content", true), // allow insecure content
		chromedp.Flag("ignore-certificate-errors", true),      // ignore TLS certificate errors
		chromedp.WindowSize(width, height),
		chromedp.Flag("no-sandbox", true),             // disable the sandbox; needed in some environments
		chromedp.Flag("disable-setuid-sandbox", true), // disable the setuid sandbox
	)

	var (
		browserCtx   context.Context
		closeBrowser = func() {}
	)
	defer func() { closeBrowser() }()

	// ensureBrowser makes sure this worker has a running browser to open tabs
	// in. Per the chromedp documentation, a child context only becomes a new
	// tab when its parent already has a browser allocated, so the parent is
	// run once here; otherwise every task would launch its own Chrome. The
	// browser is started lazily (a worker that gets no task starts nothing)
	// and replaced if its context has been canceled.
	ensureBrowser := func() {
		if browserCtx != nil && browserCtx.Err() == nil {
			return
		}
		closeBrowser()
		allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)
		ctx, cancel := chromedp.NewContext(allocCtx)
		browserCtx = ctx
		closeBrowser = func() {
			cancel()
			allocCancel()
		}
		if err := chromedp.Run(ctx); err != nil {
			// Not fatal: the per-task contexts below will then try to
			// allocate a browser of their own.
			log.Printf("工作线程 %d 启动浏览器失败: %v", workerID, err)
		}
	}

	for task := range taskCh {
		success := false
		for attempt := 1; attempt <= retry; attempt++ {
			log.Printf("工作线程 %d 正在处理 %s (尝试 %d/%d)\n", workerID, task.URL, attempt, retry)
			if attempt > 1 {
				// Back off a little longer before each further attempt.
				time.Sleep(time.Duration(attempt*2) * time.Second)
			}

			// One tab per attempt; canceling the context closes it again.
			ensureBrowser()
			ctx, cancel := chromedp.NewContext(browserCtx)
			err := captureScreenshot(ctx, task.URL, fullPage, timeout, task.Filename)
			cancel()
			if err == nil {
				log.Printf("工作线程 %d 成功保存截图: %s\n", workerID, task.Filename)
				success = true
				break
			}
			log.Printf("工作线程 %d 处理 %s 失败: %v (尝试 %d/%d)\n", workerID, task.URL, err, attempt, retry)

			// Some failures are not retried.
			msg := err.Error()
			if strings.Contains(msg, "ERR_NAME_NOT_RESOLVED") {
				log.Printf("域名未被解析,停止对此URL的重试: %s", task.URL)
				break
			}
			if strings.Contains(msg, "context canceled") {
				log.Printf("上下文已取消,停止对此URL的重试: %s", task.URL)
				break
			}
		}

		// The task counts as finished whether or not it succeeded.
		atomic.AddInt64(&finishedTasks, 1)
		if success {
			continue
		}

		log.Printf("工作线程 %d 处理 %s 失败,已达到最大重试次数\n", workerID, task.URL)
		if err := recordFailedURL(task.URL); err != nil {
			log.Printf("记录失败URL出错: %v", err)
		}
	}
}

// recordFailedURL appends rawURL as one line to failed_urls.txt in the
// current working directory, creating the file if necessary.
func recordFailedURL(rawURL string) error {
	f, err := os.OpenFile("failed_urls.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	if _, err := f.WriteString(rawURL + "\n"); err != nil {
		f.Close()
		return err
	}
	return f.Close()
}
// captureScreenshot navigates the tab behind ctx to url, waits until
// document.readyState is "complete" plus a two-second settle delay, takes a
// screenshot and writes it to outputPath.
//
// When fullPage is true the whole page is captured, otherwise only the current
// viewport. timeout is the overall limit in seconds for the navigation, the
// wait and the capture. The chromedp or file-system error is returned to the
// caller, which is responsible for logging it.
func captureScreenshot(ctx context.Context, url string, fullPage bool, timeout int, outputPath string) error {
	limit := time.Duration(timeout) * time.Second
	ctx, cancel := context.WithTimeout(ctx, limit)
	defer cancel()

	// pause waits for d but returns early with the context's error when the
	// context is canceled or times out, unlike a plain time.Sleep.
	pause := func(ctx context.Context, d time.Duration) error {
		timer := time.NewTimer(d)
		defer timer.Stop()
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-timer.C:
			return nil
		}
	}

	// waitLoaded polls document.readyState every 500ms until it is "complete".
	waitLoaded := chromedp.ActionFunc(func(ctx context.Context) error {
		var readyState string
		start := time.Now()
		for {
			if err := chromedp.Evaluate(`document.readyState`, &readyState).Do(ctx); err != nil {
				return err
			}
			if readyState == "complete" {
				// Give late-rendering content two more seconds.
				return pause(ctx, 2*time.Second)
			}
			if time.Since(start) > limit {
				return errors.New("等待页面加载超时")
			}
			if err := pause(ctx, 500*time.Millisecond); err != nil {
				return err
			}
		}
	})

	var buf []byte
	// Output paths end in ".png", so the image data must be PNG. According to
	// the chromedp documentation FullScreenshot only emits PNG at quality 100
	// (any other value yields JPEG), and CaptureScreenshot captures the
	// viewport as PNG.
	var shot chromedp.Action = chromedp.CaptureScreenshot(&buf)
	if fullPage {
		shot = chromedp.FullScreenshot(&buf, 100)
	}

	if err := chromedp.Run(ctx, chromedp.Navigate(url), waitLoaded, shot); err != nil {
		return err
	}
	if len(buf) == 0 {
		return errors.New("截图数据为空")
	}
	if err := ioutil.WriteFile(outputPath, buf, 0644); err != nil {
		return fmt.Errorf("保存截图 %s: %w", outputPath, err)
	}
	return nil
}
go.mod
module screenshot-tool
go 1.24.4
require (
github.com/chromedp/cdproto v0.0.0-20250403032234-65de8f5d025b // indirect
github.com/chromedp/chromedp v0.13.7 // indirect
github.com/chromedp/sysutil v1.1.0 // indirect
github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
golang.org/x/sys v0.29.0 // indirect
)
运行命令:
go run main.go