python网络爬取个人学习指南-(五)

发布于:2025-09-10 ⋅ 阅读:(19) ⋅ 点赞:(0)

**************************************************************************************************************

author:keyinfo

date:2025-09-09 23:50

title:网络信息爬取之多联级标题内容点击

**************************************************************************************************************

(到此已初步完成对页面多级链接的爬取,并能将结果写入文件;接下来可以研究如何绕过反爬措施,以及对内容做更精细化的筛选。)

对AI的提问:

问:TypeError: 'dict' object is not callable错误
场景 1:误用圆括号 () 访问字典(最常见)
场景 2:变量名与字典类型 dict 重名
场景 3:字典方法调用错误(少了括号或多了括号)
场景 4:函数/变量名与字典混淆
总结:如何避免此错误?
访问字典键时用 [],而非 ():my_dict[key] 或 my_dict.get(key) 。
不要用 dict 作为变量名:避免覆盖内置类型。
区分对象和方法调用:方法调用需加 ()(如 my_dict.keys() ),而对象本身(如字典、列表)不能加 ()。
检查变量类型:如果不确定变量类型,可用 print(type(my_var)) 确认,避免将非函数对象当作函数调用。

源代码已经由AI调优,并添加了注释

核心代码:

代码段1:

        # Collect the target of every <a> tag on the page.
        for link in soup.find_all("a"):
            href = (link.get("href") or "").strip()

            # Skip empty hrefs, in-page anchors and non-navigable schemes
            # (the scheme comparison is case-insensitive).
            if not href or href.lower().startswith(("#", "javascript:", "mailto:")):
                continue

            # urljoin resolves relative paths against the page URL and leaves
            # absolute hrefs untouched; plain string concatenation would glue
            # "https://other.site/x" onto the end of the current URL.
            full_url = urljoin(url, href)
            collected_links.append(full_url)

代码段2:

def save_links_to_file(links, file_path, buffer_size=1000):
    """Append the not-yet-stored entries of *links* to the text file *file_path*.

    Args:
        links: Iterable of link strings, processed in order.
        file_path: Path of the UTF-8 text file holding one link per line.
        buffer_size: Number of pending lines that triggers a write to disk.

    Lines already present in the file (compared after stripping whitespace)
    and repeats inside *links* are skipped. The file is only opened for
    appending when there is something new to write.
    """

    def flush(lines):
        # Append one batch of newline-terminated links to the output file.
        with open(file_path, "a", encoding="utf-8") as out:
            out.writelines(lines)

    # Load what an earlier run stored so it is not written a second time.
    known = set()
    try:
        with open(file_path, "r", encoding="utf-8") as existing:
            for raw_line in existing:
                known.add(raw_line.strip())
    except FileNotFoundError:
        # No previous output: every link counts as new.
        pass

    pending = []
    for link in links:
        if link not in known:
            known.add(link)
            pending.append(f"{link}\n")

            # Write the batch out as soon as it reaches the configured size.
            if len(pending) >= buffer_size:
                flush(pending)
                pending = []

    # Write whatever is left over from the last, partially filled batch.
    if pending:
        flush(pending)

完整代码:

import time
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
 
 
# Record the program start time; it is read again at the end of the script to
# report the total run time. NOTE: it is captured before the input prompts,
# so the reported duration includes the time spent waiting for the user.
start_time = time.perf_counter() 
 
 
def fetch_url(url, headers, collected_links, timeout=10):
    """Download *url* and append every hyperlink found on the page to *collected_links*.

    Args:
        url: Address of the page to fetch; also the base against which
            relative links are resolved.
        headers: Mapping of HTTP headers sent with the request.
        collected_links: List that receives the resolved link URLs. It is
            mutated in place.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the script indefinitely.

    Returns:
        The same ``collected_links`` list that was passed in.

    Exceptions raised while fetching or parsing are printed to stdout instead
    of being propagated; in that case the list is returned with whatever was
    collected up to that point.
    """
    try:
        # Build the request object with the caller-supplied headers.
        req = Request(url, headers=headers)

        # Send the request and read the whole response body.
        with urlopen(req, timeout=timeout) as response:
            content = response.read()
            status_code = response.getcode()

        # Parse the HTML with BeautifulSoup.
        soup = BeautifulSoup(content, "lxml")

        # Collect the target of every <a> tag on the page.
        for link in soup.find_all("a"):
            href = (link.get("href") or "").strip()

            # Skip empty hrefs, in-page anchors and non-navigable schemes
            # (the scheme comparison is case-insensitive).
            if not href or href.lower().startswith(("#", "javascript:", "mailto:")):
                continue

            # urljoin resolves relative paths against the page URL and leaves
            # absolute hrefs untouched; plain string concatenation would glue
            # "https://other.site/x" onto the end of the current URL.
            collected_links.append(urljoin(url, href))

        # Report the outcome of the request.
        print(f"URL: {url}")
        print(f"Status: {status_code}")
        print(f"描述: {'成功' if status_code == 200 else '未知'}")

    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(f"HTTP 错误: URL={url}, 状态码={e.code},  原因={e.reason}")
    except URLError as e:
        print(f"URL 错误: URL={url}, 原因={e.reason}")
    except Exception as e:
        # Last-resort boundary: report anything else (e.g. a malformed URL)
        # without aborting the caller.
        print(f"请求失败: {e}")

    return collected_links
 
 
def extract_base_url(url, default_scheme="https"):
    """Return the ``scheme://host[:port]`` part of *url*.

    Args:
        url: Address as typed by the user; surrounding whitespace is ignored.
        default_scheme: Scheme assumed when *url* does not carry one
            (e.g. ``"example.com/path"`` or ``"//example.com"``).

    Returns:
        The base URL without path, query or fragment.
    """
    candidate = url.strip()
    parsed = urlparse(candidate)

    # Without a leading "scheme://" urlparse puts the host into the path (or
    # mistakes "host:port" for a scheme) and netloc stays empty. Retry with
    # the default scheme in that case, but leave URLs that really do start
    # with "<scheme>://" (such as "file:///tmp/x") as they are.
    has_explicit_scheme = bool(parsed.scheme) and candidate.lower().startswith(
        f"{parsed.scheme}://"
    )
    if not parsed.netloc and not has_explicit_scheme:
        parsed = urlparse(f"{default_scheme}://{candidate}")

    # Protocol-relative input ("//host/path") has a netloc but no scheme.
    scheme = parsed.scheme or default_scheme
    return f"{scheme}://{parsed.netloc}"
 
 
def save_links_to_file(links, file_path, buffer_size=1000):
    """Append the not-yet-stored entries of *links* to the text file *file_path*.

    Args:
        links: Iterable of link strings, processed in order.
        file_path: Path of the UTF-8 text file holding one link per line.
        buffer_size: Number of pending lines that triggers a write to disk.

    Lines already present in the file (compared after stripping whitespace)
    and repeats inside *links* are skipped. The file is only opened for
    appending when there is something new to write.
    """

    def flush(lines):
        # Append one batch of newline-terminated links to the output file.
        with open(file_path, "a", encoding="utf-8") as out:
            out.writelines(lines)

    # Load what an earlier run stored so it is not written a second time.
    known = set()
    try:
        with open(file_path, "r", encoding="utf-8") as existing:
            for raw_line in existing:
                known.add(raw_line.strip())
    except FileNotFoundError:
        # No previous output: every link counts as new.
        pass

    pending = []
    for link in links:
        if link not in known:
            known.add(link)
            pending.append(f"{link}\n")

            # Write the batch out as soon as it reaches the configured size.
            if len(pending) >= buffer_size:
                flush(pending)
                pending = []

    # Write whatever is left over from the last, partially filled batch.
    if pending:
        flush(pending)
 
 
if __name__ == "__main__":
    # Ask the user which site to crawl and where to store the result.
    input_url = input("请输入网址:")
    output_path = input("请输入保存路径(例如:./links.txt ):")

    # Only the scheme://host part of the entered address is crawled.
    base_url = extract_base_url(input_url)

    # The Host header is the base URL with its scheme prefix cut off.
    if base_url.startswith("https://"):
        prefix_length = len("https://")
    else:
        prefix_length = len("http://")
    host = base_url[prefix_length:]

    # Browser-like request headers.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": base_url,
        "Host": host,
    }

    # Seed the result list with the base URL, then add every link found on
    # that page (fetch_url returns the very list it was given).
    all_links = fetch_url(base_url, headers, [base_url])

    # Persist the collected links.
    save_links_to_file(all_links, output_path)

    # Report the elapsed time since start_time was recorded.
    elapsed = time.perf_counter() - start_time
    print(f"程序共计运行:{elapsed:.4f}秒")


网站公告

今日签到

点亮在社区的每一天
去签到