数据清洗-敏感词

发布于:2024-04-01 ⋅ 阅读:(86) ⋅ 点赞:(0)
import os
import re
import json

def _context_snippet(text, start, end, width):
    """Return the match at ``text[start:end]`` with up to *width* chars each side.

    The surrounding context never crosses a line break, so a snippet always
    stays within the line that contains the match.
    """
    before = text[max(0, start - width):start].rpartition('\n')[2]
    after = text[end:end + width].partition('\n')[0]
    return before + text[start:end] + after


def process_sensitive(input_folder, sensitive_file_path, output_folder,
                      context_chars=5):
    """Mask sensitive words in every ``.txt`` file of a folder and log each hit.

    Each ``.txt`` file directly inside *input_folder* is read as UTF-8, every
    case-insensitive occurrence of a sensitive word is replaced by ``*``
    repeated ``len(word)`` times, and the result is written under the same
    file name in *output_folder*. One record per occurrence is collected and
    dumped to ``replace_info.json`` in *output_folder*.

    Args:
        input_folder: Directory containing the ``.txt`` files to clean.
        sensitive_file_path: UTF-8 text file with one sensitive word per line;
            blank lines are ignored and surrounding whitespace is stripped.
        output_folder: Directory for the cleaned files and the JSON report;
            created if missing. May be the same directory as *input_folder*.
        context_chars: Maximum number of characters of context recorded on
            each side of a hit (default 5).

    Each report record has the keys ``file_name``, ``sensitive_word`` (the
    word as listed in the word file) and ``txt_content`` (the matched text as
    it appears in the file, plus its surrounding context).
    """
    os.makedirs(output_folder, exist_ok=True)

    # The word list does not depend on the file being processed: load it once.
    with open(sensitive_file_path, 'r', encoding='utf-8') as sensitive_file_obj:
        sensitive_words = {line.strip() for line in sensitive_file_obj if line.strip()}

    # Set iteration order varies between runs. Process longer words first (ties
    # broken alphabetically) so results are reproducible and a short word never
    # masks part of a longer word that contains it.
    word_patterns = [
        (word, re.compile(re.escape(word), re.IGNORECASE))
        for word in sorted(sensitive_words, key=lambda w: (-len(w), w))
    ]

    replace_info = []  # one record per replaced occurrence

    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)

        # Read and close the input before opening the output for writing;
        # otherwise an in-place run (same folder) would truncate the file
        # before its content had been read.
        with open(input_file_path, 'r', encoding='utf-8') as input_file:
            txt_content = input_file.read()

        for word, pattern in word_patterns:
            hits = list(pattern.finditer(txt_content))
            if not hits:
                continue

            # Context is sliced around each hit rather than captured by the
            # regex, so hits that sit close together are all recorded.
            for hit in hits:
                replace_info.append({
                    'file_name': filename,
                    'sensitive_word': word,
                    'txt_content': _context_snippet(
                        txt_content, hit.start(), hit.end(), context_chars),
                })

            txt_content = pattern.sub('*' * len(word), txt_content)

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(txt_content)

    # Write the replacement report next to the cleaned files.
    replace_info_file = os.path.join(output_folder, 'replace_info.json')
    with open(replace_info_file, 'w', encoding='utf-8') as json_file:
        json.dump(replace_info, json_file, ensure_ascii=False, indent=4)

# Example locations used by the author; adjust them to your own environment.
output_sensitive = 'D:\\2024work\\wangzhe\\清洗过程\\处理敏感词后的文件'
sensitive_file_path = 'D:\\2024work\\wangzhe\\清洗小说\\敏感词\\sensitive_words.txt'

# Run the cleaning job only when executed as a script, so that importing this
# module does not start touching files on disk.
if __name__ == "__main__":
    process_sensitive("D:\\2024work\\wangzhe\\清洗过程\\处理关键词后的文件", sensitive_file_path, output_sensitive)

敏感词词库可参考以下开源项目:

GitHub - 57ing/Sensitive-word(https://github.com/57ing/Sensitive-word):收集的一些敏感词汇,挺全的,还细分了暴恐词库、反动词库、民生词库、色情词库、贪腐词库、其他词库等

本文含有隐藏内容,请 开通VIP 后查看

网站公告

今日签到

点亮在社区的每一天
去签到