[Website Content Security Detection] Part 1: Collecting All Site Links (Sitemap Data)

Published: 2025-06-27

Without further ado, here is the code. The crawler reads a list of start URLs, fetches pages concurrently (bounded by a semaphore), extracts href and src links with goquery, keeps only links on the start domains, and writes every visited URL to a text file.
main.go

package main

import (
    "bufio"
    "crypto/tls"
    "fmt"
    "io"
    "log"
    "net/http"
    _ "net/http/pprof"
    "net/url"
    "os"
    "strings"
    "sync"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/schollz/progressbar/v3"
)

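// WebCrawler crawls a set of start URLs concurrently, stays within their
// domains, and records every visited URL in visitedURLs.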
type WebCrawler struct {
    startURLs     []string
    baseDomains   map[string]bool
    visitedURLs   sync.Map
    urlsToVisit   chan string
    semaphore     chan struct{}
    timeout       time.Duration
    verifySSL     bool
    client        *http.Client
    progressBar   *progressbar.ProgressBar
    wg            sync.WaitGroup
}

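// NewWebCrawler builds a crawler whose domain filter is derived from the
// hosts of the start URLs.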
func NewWebCrawler(startURLs []string, maxConnections int, timeout int, verifySSL bool) *WebCrawler {
    baseDomains := make(map[string]bool)
    for _, u := range startURLs {
        parsed, err := url.Parse(u)
        if err != nil || parsed.Host == "" {
            // Skip entries that are not absolute URLs (e.g. missing the
            // scheme); they would otherwise break the domain filter.
            log.Printf("skipping invalid start URL %q: %v", u, err)
            continue
        }
        baseDomains[parsed.Host] = true
    }

    return &WebCrawler{
        startURLs:   startURLs,
        baseDomains: baseDomains,
        urlsToVisit: make(chan string, 1000),
        semaphore:   make(chan struct{}, maxConnections),
        timeout:     time.Duration(timeout) * time.Second,
        verifySSL:   verifySSL,
    }
}

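// initClient creates the shared HTTP client, optionally skipping TLS
// certificate verification.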
func (c *WebCrawler) initClient() {
    tr := &http.Transport{
        TLSClientConfig: &tls.Config{InsecureSkipVerify: !c.verifySSL},
    }
    c.client = &http.Client{
        Timeout:   c.timeout,
        Transport: tr,
    }
}

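// normalizeURL resolves a possibly relative link against the page it was
// found on and returns an absolute URL.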
func (c *WebCrawler) normalizeURL(rawURL string, baseURL string) (string, error) {
    base, err := url.Parse(baseURL)
    if err != nil || base == nil {
        return "", fmt.Errorf("invalid base URL: %v", err)
    }
    u, err := url.Parse(rawURL)
    if err != nil || u == nil {
        return "", fmt.Errorf("invalid URL: %v", err)
    }
    return base.ResolveReference(u).String(), nil
}

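// isValidURL keeps only http/https links on one of the start domains and
// rejects common static-file extensions.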
func (c *WebCrawler) isValidURL(rawURL string) bool {
    parsed, err := url.Parse(rawURL)
    if err != nil {
        return false
    }
    if parsed.Scheme != "http" && parsed.Scheme != "https" {
        return false
    }
    if !c.baseDomains[parsed.Host] {
        return false
    }
    extensions := []string{".jpg", ".jpeg", ".png", ".gif", ".pdf", ".zip"}
    for _, ext := range extensions {
        if strings.HasSuffix(strings.ToLower(parsed.Path), ext) {
            return false
        }
    }
    return true
}

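// fetchURL downloads a page (bounded by the connection semaphore) and returns
// its HTML body; non-200 responses and non-HTML content are treated as errors.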
func (c *WebCrawler) fetchURL(url string) (string, error) {
    c.semaphore <- struct{}{}
    defer func() { <-c.semaphore }()

    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return "", fmt.Errorf("request creation failed: %v", err)
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    
    resp, err := c.client.Do(req)
    if err != nil {
        if strings.Contains(err.Error(), "no such host") {
            return "", fmt.Errorf("DNS lookup failed for %s", url)
        }
        return "", fmt.Errorf("request failed: %v", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != 200 {
        return "", fmt.Errorf("non-200 status: %d", resp.StatusCode)
    }
    
    if !strings.Contains(resp.Header.Get("Content-Type"), "text/html") {
        return "", fmt.Errorf("non-HTML content type: %s", resp.Header.Get("Content-Type"))
    }

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("error reading response: %v", err)
    }
    return string(body), nil
}

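// parseLinks extracts href and src attributes from the page and returns the
// normalized links that pass the domain filter.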
func (c *WebCrawler) parseLinks(html string, baseURL string) []string {
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        log.Printf("Error parsing HTML: %v", err)
        return nil
    }

    var links []string
    doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
        href, exists := s.Attr("href")
        if !exists || strings.HasPrefix(href, "javascript:") || href == "#" {
            return
        }
        normalized, err := c.normalizeURL(href, baseURL)
        if err != nil {
            log.Printf("Error normalizing URL %s: %v", href, err)
            return
        }
        if c.isValidURL(normalized) {
            links = append(links, normalized)
        }
    })

    doc.Find("[src]").Each(func(i int, s *goquery.Selection) {
        src, exists := s.Attr("src")
        if !exists || strings.HasPrefix(src, "data:") {
            return
        }
        normalized, err := c.normalizeURL(src, baseURL)
        if err != nil {
            log.Printf("Error normalizing URL %s: %v", src, err)
            return
        }
        if c.isValidURL(normalized) {
            links = append(links, normalized)
        }
    })

    return links
}

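// processURL marks a URL as visited, fetches and parses it, and queues any
// newly discovered links for the dispatcher in crawl().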
func (c *WebCrawler) processURL(url string) {
    defer c.wg.Done()

    // LoadOrStore marks the URL as visited atomically, so two goroutines can
    // never both claim the same URL.
    if _, loaded := c.visitedURLs.LoadOrStore(url, true); loaded {
        return
    }

    html, err := c.fetchURL(url)
    if err != nil {
        fmt.Printf("Error fetching %s: %v\n", url, err)
        return
    }

    newLinks := c.parseLinks(html, url)
    for _, link := range newLinks {
        if _, exists := c.visitedURLs.Load(link); !exists {
            // Count the queued URL on the WaitGroup before handing it off, so
            // crawl() cannot return while work is still waiting in the channel.
            c.wg.Add(1)
            c.urlsToVisit <- link
        }
    }

    if c.progressBar != nil {
        c.progressBar.Add(1)
    }
}

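// crawl seeds the workers with the start URLs, dispatches newly discovered
// URLs from the channel, and blocks until all queued work has finished.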
func (c *WebCrawler) crawl() {
    c.initClient()

    c.progressBar = progressbar.Default(-1, "crawl progress")
    defer c.progressBar.Close()

    for _, url := range c.startURLs {
        c.wg.Add(1)
        go c.processURL(url)
    }

    // Dispatcher: every URL in the channel has already been counted on the
    // WaitGroup by processURL, so it only needs to be handed to a worker here;
    // processURL itself skips URLs that were already visited.
    go func() {
        for newURL := range c.urlsToVisit {
            go c.processURL(newURL)
        }
    }()

    c.wg.Wait()
    // All producers are done once Wait returns, so closing the channel lets
    // the dispatcher goroutine exit.
    close(c.urlsToVisit)
}

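// saveResults writes every visited URL to the given file, one per line.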
func (c *WebCrawler) saveResults(filename string) {
    file, err := os.Create(filename)
    if err != nil {
        fmt.Printf("Error creating file: %v\n", err)
        return
    }
    defer file.Close()

    c.visitedURLs.Range(func(key, _ interface{}) bool {
        file.WriteString(key.(string) + "\n")
        return true
    })
}

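// run times the crawl, prints summary statistics and saves the results.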
func (c *WebCrawler) run() {
    startTime := time.Now()
    c.crawl()
    elapsed := time.Since(startTime)

    fmt.Printf("\n爬取完成!\n")
    // 修复语法错误:添加缺少的括号和逗号
    visitedCount := 0
    c.visitedURLs.Range(func(key, _ interface{}) bool {
        visitedCount++
        return true
    })
    fmt.Printf("共爬取 %d 个URL\n", visitedCount)
    fmt.Printf("用时: %.2f 秒\n", elapsed.Seconds())

    outputFile := "multi_domain_links.txt"
    c.saveResults(outputFile)
    fmt.Printf("结果已保存到 %s\n", outputFile)
}

func main() {
    // Expose pprof on localhost:6060 so the crawl can be profiled while running.
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()

    if len(os.Args) < 2 {
        fmt.Println("Usage: go run main.go <URL file> [verify_ssl]")
        fmt.Println("Example: go run main.go urls.txt")
        fmt.Println("Or: go run main.go urls.txt true")
        return
    }

    urlFile := os.Args[1]
    file, err := os.Open(urlFile)
    if err != nil {
        fmt.Printf("错误:文件 %s 不存在\n", urlFile)
        return
    }
    defer file.Close()

    var startURLs []string
    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        if url := strings.TrimSpace(scanner.Text()); url != "" {
            startURLs = append(startURLs, url)
        }
    }

    if len(startURLs) == 0 {
        fmt.Println("错误:URL文件为空")
        return
    }

    
    verifySSL := false
    if len(os.Args) > 2 {
        verifySSL = os.Args[2] == "true"
    }

    crawler := NewWebCrawler(startURLs, 50, 20, verifySSL)
    
    fmt.Printf("Starting crawl: %d start URLs, verify SSL: %v\n", len(startURLs), verifySSL)
    
    crawler.run()
}
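To sanity-check the link filtering before a full crawl, the helpers normalizeURL and isValidURL can be exercised directly. Below is a minimal sketch of such a test, assuming it is saved as main_test.go next to main.go in the same package; the hostnames in it are placeholders, not real targets.

main_test.go

package main

import "testing"

func TestURLFiltering(t *testing.T) {
    // The crawler is only used for its filtering helpers here; the connection
    // count and timeout values are irrelevant for this test.
    c := NewWebCrawler([]string{"https://www.example.com"}, 1, 5, false)

    // A relative href should be resolved against the page it was found on.
    got, err := c.normalizeURL("/about", "https://www.example.com/index.html")
    if err != nil || got != "https://www.example.com/about" {
        t.Fatalf("normalizeURL: got %q, err %v", got, err)
    }

    // Same-domain HTML pages are kept; foreign domains and static files are not.
    if !c.isValidURL("https://www.example.com/news/1.html") {
        t.Error("expected a same-domain page to be valid")
    }
    if c.isValidURL("https://other.example.org/") {
        t.Error("expected a foreign domain to be rejected")
    }
    if c.isValidURL("https://www.example.com/logo.png") {
        t.Error("expected a static file to be rejected")
    }
}

Run it with go test from the module root.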

go.mod

module webcrawler

go 1.24.4

require (
	github.com/PuerkitoBio/goquery v1.10.3
	github.com/schollz/progressbar/v3 v3.18.0
)

require (
	github.com/andybalholm/cascadia v1.3.3 // indirect
	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
	github.com/rivo/uniseg v0.4.7 // indirect
	golang.org/x/net v0.39.0 // indirect
	golang.org/x/sys v0.32.0 // indirect
	golang.org/x/term v0.31.0 // indirect
)

domains.txt

https://www.网址.com
https://www.网址2.com

Run command

After fetching the dependencies (go mod tidy), run:

go run main.go .\domains.txt

Each line of the URL file must be a full URL including the http:// or https:// scheme, otherwise the host cannot be extracted for the same-domain filter. When the crawl finishes, the results are written to multi_domain_links.txt in the current directory.
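The generated link list is the input for the later checks in this series. As a quick sanity check it can be read back and summarized per host. The sketch below is illustrative only and assumes the default output name multi_domain_links.txt; it is a separate small program, so it should live in its own directory to avoid clashing with the crawler's main.

summarize.go (hypothetical helper)

package main

import (
    "bufio"
    "fmt"
    "net/url"
    "os"
    "strings"
)

// Counts how many crawled URLs in multi_domain_links.txt belong to each host.
func main() {
    file, err := os.Open("multi_domain_links.txt")
    if err != nil {
        fmt.Printf("failed to open results: %v\n", err)
        return
    }
    defer file.Close()

    counts := make(map[string]int)
    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        line := strings.TrimSpace(scanner.Text())
        if line == "" {
            continue
        }
        if u, err := url.Parse(line); err == nil && u.Host != "" {
            counts[u.Host]++
        }
    }
    for host, n := range counts {
        fmt.Printf("%s: %d URLs\n", host, n)
    }
}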

