NLP---TF-IDF案例分析-EW帮帮网

一·案例 - 红楼梦

1 首先准备语料库

http://www.dxsxs.com

到上面这个网址下载《红楼梦》全文，保存为 UTF-8 编码的“红楼梦.txt”，并与下面的脚本放在同一目录。

2 任务一：拆分提取

import os
import re

def _save_chapter(output_dir, index, chapter_name, chapter_lines, end_pattern):
    """Write one chapter to ``<output_dir>/<NNN>_<chapter_name>.txt``.

    The lines are joined with newlines, everything from "且听下回分解" to the
    end of the chapter is removed via ``end_pattern``, and surrounding
    whitespace is stripped before writing as UTF-8.
    """
    cleaned = end_pattern.sub('', '\n'.join(chapter_lines)).strip()
    output_path = os.path.join(
        output_dir,
        f"{str(index).zfill(3)}_{chapter_name}.txt"
    )
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.write(cleaned)


def split_hongloumeng(input_path=None, output_dir=None):
    """Split the novel text into one UTF-8 file per chapter ("回").

    Args:
        input_path: Path of the source text. Defaults to ``红楼梦.txt`` in the
            directory of this script.
        output_dir: Directory that receives the chapter files. Defaults to a
            ``分卷`` directory next to this script. It is created if missing.

    Output files are named ``001_第一回.txt``, ``002_第二回.txt``, ... where
    the numeric prefix is the order in which the headings were encountered.
    The first line of every file is the chapter marker itself (e.g. "第一回");
    the remainder of the heading line is not kept. Text that precedes the
    first heading is discarded.

    Raises:
        FileNotFoundError: if ``input_path`` does not exist.
    """
    # Only fall back to the script directory for the paths the caller omitted.
    if input_path is None or output_dir is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        if input_path is None:
            input_path = os.path.join(script_dir, "红楼梦.txt")
        if output_dir is None:
            output_dir = os.path.join(script_dir, "分卷")
    os.makedirs(output_dir, exist_ok=True)

    # Site banner inserted by the download source ("手机电子书...本章字数:123").
    header_pattern = re.compile(r'手机电子书·大学生小说网.*?本章字数:\d+', re.DOTALL)
    # Chapter marker with Chinese or Arabic numerals. 零/〇/两 are included so
    # that headings such as "第一百零一回" are recognised instead of being
    # merged into the previous chapter.
    chapter_pattern = re.compile(r'第(?:[零〇一二两三四五六七八九十百千万]+|\d+)回')
    # Closing formula of a chapter and everything after it.
    end_pattern = re.compile(r'且听下回分解.*?$', re.DOTALL)

    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()
    content = header_pattern.sub('', content).strip()

    current_chapter = None  # marker of the chapter being collected
    current_lines = []      # non-empty lines collected for that chapter
    written = 0             # number of chapter files written so far

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # A heading must START the line. Searching anywhere in the line would
        # treat body sentences that merely mention a chapter (for example
        # "此开卷第一回也") as headings and silently drop that sentence.
        chapter_match = chapter_pattern.match(line)
        if chapter_match is None:
            current_lines.append(line)
            continue

        # New heading: flush the chapter collected so far, if any.
        if current_chapter:
            written += 1
            _save_chapter(output_dir, written, current_chapter,
                          current_lines, end_pattern)

        current_chapter = chapter_match.group(0)
        current_lines = [current_chapter]

    # The last chapter has no following heading to trigger the flush.
    if current_chapter:
        written += 1
        _save_chapter(output_dir, written, current_chapter,
                      current_lines, end_pattern)

    print(f"✅ 切割完成！共 {written} 回")
    print(f"📁 保存位置：{output_dir}")
    print("🔍 文件名按【001_第一回、002_第二回...】排序，可直接用")

if __name__ == "__main__":
    split_hongloumeng()

任务二·把分好后的卷，转换成TF-IDF能识别的卷

#
import pandas as pd  # 数据预处理库
import os  # 用于文件和目录操作
import jieba  # 用于中文分词

# Directory that contains this script; every other path is joined onto it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Parallel lists: filePaths[i] is the path whose text is fileContents[i].
filePaths = []
fileContents = []

# Collect every file below the "分卷" directory next to this script.
fenjuan_dir = os.path.join(current_dir, "分卷")
for root, dirs, files in os.walk(fenjuan_dir):
    # os.walk yields names in arbitrary order. Sorting both lists makes the
    # traversal deterministic, so files named 001_..., 002_... are read in
    # chapter order and line N of the output corresponds to chapter N.
    dirs.sort()
    for name in sorted(files):
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        with open(filePath, 'r', encoding='utf-8') as f:
            fileContent = f.read()
        fileContents.append(fileContent)

# One row per file: its path and its full text.
corpos = pd.DataFrame({
    'filePath': filePaths,
    'fileContent': fileContents
})

# Load the custom dictionary (same directory as this script) into jieba
# before any segmentation happens.
user_dict_path = os.path.join(current_dir, "红楼梦词库.txt")
jieba.load_userdict(user_dict_path)

# Stop-word list, also in the script directory. The file must provide a
# column named "stopword", because that column is read below.
stopwords_path = os.path.join(current_dir, "StopwordsCN.txt")
stopwords = pd.read_csv(stopwords_path, encoding='utf8', engine='python', index_col=False)
# Build the lookup once: set membership is a hash lookup, whereas `in` on the
# underlying array compares against every stop word for every token.
stopword_set = set(stopwords.stopword.values)

# Write one line per file: the kept tokens, each followed by a single space.
output_file = os.path.join(current_dir, "wj.txt")
# `with` guarantees the file is closed even if segmentation raises.
with open(output_file, 'w', encoding='utf-8') as file_to_jieba:
    for index, row in corpos.iterrows():
        fileContent = row['fileContent']
        segs = jieba.cut(fileContent)
        # Drop stop words and whitespace-only tokens.
        juan_ci = ''.join(
            seg + ' '
            for seg in segs
            if seg not in stopword_set and len(seg.strip()) > 0
        )
        file_to_jieba.write(juan_ci + '\n')

1. 导入所需库

python

运行

import pandas as pd  # 数据预处理库
import os  # 用于文件和目录操作
import jieba  # 用于中文分词

导入pandas库，用于数据的结构化处理（如创建 DataFrame）
导入os库，用于处理文件路径和目录遍历
导入jieba库，用于中文文本的分词处理

2. 获取当前脚本所在目录

python

运行

# Absolute path of the directory that contains this script.
current_dir = os.path.dirname(os.path.abspath(__file__))

os.path.abspath(__file__)获取当前脚本的绝对路径
os.path.dirname()提取该路径中的目录部分
目的是获取可靠的基准路径，避免相对路径带来的问题

3. 初始化存储数据的列表

python

运行

filePaths = []  # paths of the files that are read
fileContents = []  # content belonging to each path in filePaths

创建两个空列表，分别用于存储后续读取的文件路径和文件内容

4. 遍历文件夹并读取文件内容

python

运行

# Collect every file below the "分卷" directory.
fenjuan_dir = os.path.join(current_dir, "分卷")
for root, dirs, files in os.walk(fenjuan_dir):
    # os.walk yields names in arbitrary order; sort so that files named
    # 001_..., 002_... are read in chapter order on every platform.
    dirs.sort()
    for name in sorted(files):
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        with open(filePath, 'r', encoding='utf-8') as f:
            fileContent = f.read()
            fileContents.append(fileContent)

拼接得到 "分卷" 文件夹的完整路径
使用os.walk()遍历 "分卷" 文件夹下的所有文件
对每个文件，拼接完整路径并添加到filePaths列表
以 UTF-8 编码打开文件，读取内容并添加到fileContents列表

5. 创建数据框存储文件信息

python

运行

# Two-column frame: one row per file, holding its path and its text.
corpos = pd.DataFrame({
    'filePath': filePaths,
    'fileContent': fileContents
})

使用pandas.DataFrame()创建数据框
将文件路径和内容分别作为两列存储，便于后续按行处理

6. 加载自定义词库

python

运行

# Custom dictionary file located in current_dir, loaded into jieba.
user_dict_path = os.path.join(current_dir, "红楼梦词库.txt")
jieba.load_userdict(user_dict_path)

拼接得到红楼梦专属词库的路径
加载自定义词库，让 jieba 分词更符合《红楼梦》的语言特点

7. 读取停用词库

python

运行

# Stop-word file located in current_dir, read into a DataFrame.
# The filtering step accesses it as `stopwords.stopword`, so the file needs a
# column named "stopword".
stopwords_path = os.path.join(current_dir, "StopwordsCN.txt")
stopwords = pd.read_csv(stopwords_path, encoding='utf8', engine='python', index_col=False)

拼接得到停用词文件的路径
读取中文停用词表，用于后续过滤无意义词汇（如 "的"、"了" 等）

8. 准备输出文件

python

运行

# Open wj.txt in current_dir for writing (UTF-8); an existing file is truncated.
# The handle stays open until it is closed explicitly.
output_file = os.path.join(current_dir, "wj.txt")
file_to_jieba = open(output_file, 'w', encoding='utf-8')

定义分词结果的输出文件路径
以写入模式打开文件，准备存储处理后的结果

9. 分词处理并过滤停用词

python

运行

# Build the stop-word lookup once; set membership is a hash lookup, whereas
# `in` on the underlying array compares against every stop word per token.
stopword_set = set(stopwords.stopword.values)
for index, row in corpos.iterrows():
    fileContent = row['fileContent']
    segs = jieba.cut(fileContent)
    # Keep tokens that are neither stop words nor whitespace-only; each kept
    # token is followed by one space, and each file becomes one output line.
    juan_ci = ''.join(
        seg + ' '
        for seg in segs
        if seg not in stopword_set and len(seg.strip()) > 0
    )
    file_to_jieba.write(juan_ci + '\n')

遍历数据框中的每一行（每个文件内容）
对文件内容进行分词处理
过滤掉停用词和空字符串
将处理后的分词结果拼接成字符串
写入到输出文件中

10. 关闭文件

python

运行

# Close the output file so buffered text is flushed to disk.
file_to_jieba.close()

完成写入后关闭文件，释放系统资源

NLP---TF-IDF案例分析

一·案例 - 红楼梦

1. 导入所需库

2. 获取当前脚本所在目录

3. 初始化存储数据的列表

4. 遍历文件夹并读取文件内容

5. 创建数据框存储文件信息

6. 加载自定义词库

7. 读取停用词库

8. 准备输出文件

9. 分词处理并过滤停用词

10. 关闭文件

网站公告

今日签到

热门文章

最新发布