1. Requirements
Using the given Chinese corpus, segment the text with mutual information, bigram coupling degree, and the jieba library, then compute precision, recall, and F1 for each method. Analyze the strengths and weaknesses of the different methods and understand how jieba performs segmentation.
2. Content
2.1 Dataset preparation
The file to be segmented, train.txt, is shown in Figure 1.
Figure 1 train.txt
The gold-standard (already segmented) file is shown in Figure 2.
Figure 2 Gold-standard segmented file
2.2 Code
Step 1: Data loading. The try_decode function works around Chinese encoding problems by trying several encodings in turn, and load_data loads both the raw text and the gold-standard segmentation (used later for comparison against the test output).
Step 2: Segment the text with the following three methods (a small numeric example follows this list).
a. jieba_segment: call the jieba segmentation library directly.
b. mutual_information_segment: segment based on mutual information, computed as
MI(w1, w2) = log( P(w1w2) / (P(w1) * P(w2)) )
c. coupling_degree_segment: segment based on bigram coupling degree, computed as
CD(w1, w2) = P(w1w2) / max(P_left(w1), P_right(w2))
Step 3: The evaluate function computes precision, recall, and F1 for each of the three methods.
Step 4: Visualize the results with matplotlib.
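To make the two formulas concrete, here is a minimal numeric sketch. The probabilities are invented for illustration only (they are not estimated from train.txt), and P_left / P_right are approximated by plain unigram probabilities:

import math

p_w1, p_w2, p_bigram = 0.010, 0.020, 0.005   # hypothetical probabilities
mi = math.log(p_bigram / (p_w1 * p_w2))      # log(0.005 / 0.0002) = log(25) ≈ 3.22
cd = p_bigram / max(p_w1, p_w2)              # 0.005 / 0.02 = 0.25
print(mi, cd)

A value well above the chosen threshold means the character pair co-occurs far more often than chance would predict, so the pair is merged into a two-character word; otherwise the characters are kept separate.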
import math
from collections import Counter
import jieba

def try_decode(file_path, encodings=['utf-8', 'gbk', 'gb18030', 'latin1']):
    """Try several encodings until the file can be read (handles Chinese encoding issues)."""
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            continue
    # UnicodeDecodeError cannot be raised with only a message string, so raise ValueError instead
    raise ValueError(f"Could not decode the file with any of these encodings: {encodings}")

def load_data(raw_file, gold_file):
    raw_lines = try_decode(raw_file)
    gold_segments = try_decode(gold_file)
    return raw_lines, gold_segments

def evaluate(predicted, true):
    """Compute precision, recall, and F1.

    Note: this is a bag-of-words approximation based on set intersection,
    not a position-aware comparison of segment boundaries.
    """
    tp = len(set(predicted).intersection(true))
    fp = len(predicted) - tp
    fn = len(true) - tp
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1
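
# Toy example of the evaluation (hypothetical input, not taken from the corpus):
#   evaluate(['研究', '生命', '起源'], ['研究', '生命', '的', '起源'])
#   -> tp = 3, fp = 0, fn = 1 -> precision = 1.0, recall = 0.75, F1 ≈ 0.857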

def jieba_segment(texts):
    """Segment every line with the jieba library and return a flat word list."""
    segmented = []
    for text in texts:
        segmented.extend(jieba.lcut(text))
    return segmented
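
# For reference, jieba builds a DAG of candidate words from its prefix dictionary, picks the
# maximum-probability path by dynamic programming, and uses an HMM for out-of-vocabulary words.
# The classic example from the jieba documentation:
#   jieba.lcut("我来到北京清华大学")  ->  ['我', '来到', '北京', '清华大学']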

def build_bigram_freq(texts):
    """Estimate unigram and bigram relative frequencies from the raw text."""
    char_counter = Counter()
    bigram_counter = Counter()
    for text in texts:
        for i in range(len(text)):
            char_counter[text[i]] += 1
            if i < len(text) - 1:
                bigram = text[i] + text[i + 1]
                bigram_counter[bigram] += 1
    total_chars = sum(char_counter.values())
    total_bigrams = sum(bigram_counter.values())
    char_freq = {k: v / total_chars for k, v in char_counter.items()}
    bigram_freq = {k: v / total_bigrams for k, v in bigram_counter.items()}
    return char_freq, bigram_freq
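
# Toy example of the frequency estimation (hypothetical input): build_bigram_freq(['abab'])
#   char_freq   -> {'a': 0.5, 'b': 0.5}        (4 characters: a, b, a, b)
#   bigram_freq -> {'ab': 2/3, 'ba': 1/3}      (3 bigrams: ab, ba, ab)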

def mutual_information_segment(text, char_freq, bigram_freq, threshold=0.001):
    """Greedy left-to-right segmentation based on mutual information.

    If the MI of the current character pair exceeds the threshold, the pair is
    emitted as a two-character word; otherwise the current character is emitted alone.
    """
    segments = []
    i = 0
    while i < len(text):
        found = False
        if i + 1 < len(text):
            bigram = text[i] + text[i + 1]
            p_w1 = char_freq.get(text[i], 1e-8)
            p_w2 = char_freq.get(text[i + 1], 1e-8)
            p_bigram = bigram_freq.get(bigram, 1e-8)
            # MI(w1, w2) = log(P(w1w2) / (P(w1) * P(w2)))
            mi = math.log(p_bigram / (p_w1 * p_w2)) if p_w1 > 0 and p_w2 > 0 else 0
            if mi > threshold:
                segments.append(bigram)
                i += 2
                found = True
        if not found:
            segments.append(text[i])
            i += 1
    return segments
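
# Illustration of the greedy scan (hypothetical frequencies): if MI('中', '国') exceeds the
# threshold, mutual_information_segment('中国人', ...) emits ['中国', '人']; after a merge the
# cursor jumps two characters, so the overlapping pair '国人' is never considered.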

def mutual_information_all(texts, char_freq, bigram_freq, threshold=0.001):
    """Apply mutual-information segmentation to every line."""
    result = []
    for text in texts:
        result.extend(mutual_information_segment(text, char_freq, bigram_freq, threshold))
    return result

def coupling_degree_segment(text, bigram_freq, threshold=0.0005):
    """Greedy left-to-right segmentation based on bigram coupling degree.

    Note: as an approximation of CD(w1, w2) = P(w1w2) / max(P_left(w1), P_right(w2)),
    the denominator here uses the raw counts of w1 and w2 within the current line.
    """
    segments = []
    i = 0
    while i < len(text):
        found = False
        if i + 1 < len(text):
            bigram = text[i] + text[i + 1]
            left_count = sum(1 for t in text if t == text[i])
            right_count = sum(1 for t in text if t == text[i + 1])
            p_bigram = bigram_freq.get(bigram, 1e-8)
            cd = p_bigram / max(left_count, right_count) if max(left_count, right_count) > 0 else 0
            if cd > threshold:
                segments.append(bigram)
                i += 2
                found = True
        if not found:
            segments.append(text[i])
            i += 1
    return segments

def coupling_degree_all(texts, bigram_freq, threshold=0.0005):
    """Apply coupling-degree segmentation to every line."""
    result = []
    for text in texts:
        # forward the threshold so callers can tune it
        result.extend(coupling_degree_segment(text, bigram_freq, threshold))
    return result

def main():
    # Load the data
    raw_file = 'D:/00NLP/实验语料库25/实验5/test.txt'
    gold_file = 'D:/00NLP/实验语料库25/实验2、3/chn.txt'
    raw_texts, gold_segments = load_data(raw_file, gold_file)
    # Each gold line contains whitespace-separated words; flatten them into one word list
    flat_gold = [word for seg in gold_segments for word in seg.split()]
    char_freq, bigram_freq = build_bigram_freq(raw_texts)

    # Method 1: jieba
    jieba_result = jieba_segment(raw_texts)
    p_jieba, r_jieba, f1_jieba = evaluate(jieba_result, flat_gold)
    print("\n[Jieba segmentation]")
    print(f"Precision: {p_jieba:.4f}, Recall: {r_jieba:.4f}, F1: {f1_jieba:.4f}")

    # Method 2: mutual information (MI)
    mi_result = mutual_information_all(raw_texts, char_freq, bigram_freq)
    p_mi, r_mi, f1_mi = evaluate(mi_result, flat_gold)
    print("\n[Mutual information (MI) segmentation]")
    print(f"Precision: {p_mi:.4f}, Recall: {r_mi:.4f}, F1: {f1_mi:.4f}")

    # Method 3: bigram coupling degree (CD)
    cd_result = coupling_degree_all(raw_texts, bigram_freq)
    p_cd, r_cd, f1_cd = evaluate(cd_result, flat_gold)
    print("\n[Coupling degree (CD) segmentation]")
    print(f"Precision: {p_cd:.4f}, Recall: {r_cd:.4f}, F1: {f1_cd:.4f}")

    # Compare the three methods in a grouped bar chart
    methods = ['Jieba', 'MI', 'CD']
    precisions = [p_jieba, p_mi, p_cd]
    recalls = [r_jieba, r_mi, r_cd]
    f1_scores = [f1_jieba, f1_mi, f1_cd]

    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']   # CJK-capable font for figure text
    plt.rcParams['axes.unicode_minus'] = False
    x = range(len(methods))
    plt.figure(figsize=(8, 5))
    plt.bar(x, precisions, width=0.2, label='Precision')
    plt.bar([i + 0.2 for i in x], recalls, width=0.2, label='Recall')
    plt.bar([i + 0.4 for i in x], f1_scores, width=0.2, label='F1')
    plt.xticks([i + 0.2 for i in x], methods)
    plt.title('Segmentation method comparison')
    plt.legend()
    plt.grid(True)
    plt.show()

if __name__ == "__main__":
    main()
2.3 Result screenshots
Figures 3 and 4 show the performance comparison of the three segmentation methods.
Figure 3 Performance comparison of the three methods
Figure 4 Visualized performance comparison
3. Summary
This experiment implemented three segmentation approaches, jieba, mutual information, and bigram coupling degree, and compared their effectiveness on the same corpus.
In Chinese word segmentation, bigram coupling degree measures how tightly two adjacent characters are bound. The core idea: if two characters frequently appear together, while each appears relatively rarely as the prefix or suffix of other characters, they are more likely to form a word.
Mutual information is an information-theoretic measure of the dependence between two events. In segmentation it measures the association between adjacent characters: a value well above zero means the pair co-occurs far more often than independence would predict.
jieba is a mature segmentation toolkit that usually achieves high accuracy on general-domain text. It builds a DAG of candidate words from its prefix dictionary, selects the maximum-probability path with dynamic programming, and falls back to an HMM (decoded with Viterbi) for words not in the dictionary, which lets it recognize most common words and phrases correctly.
The results show that all three methods have fairly low recall: each of them misses many words that should have been segmented out correctly.
A natural improvement is to tune the thresholds of the mutual-information and coupling-degree methods, for example with a simple sweep as sketched below.
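As a minimal sketch of such tuning (assuming raw_texts, char_freq, bigram_freq, and flat_gold from main() are in scope; the candidate threshold values below are arbitrary examples), one could sweep the MI threshold and keep the value that maximizes F1:

# Simple threshold sweep for the mutual-information method
best_f1, best_t = 0.0, None
for t in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    result = mutual_information_all(raw_texts, char_freq, bigram_freq, threshold=t)
    _, _, f1 = evaluate(result, flat_gold)
    if f1 > best_f1:
        best_f1, best_t = f1, t
print(f"Best MI threshold: {best_t}, F1 = {best_f1:.4f}")

The same loop applies to the coupling-degree method by swapping in coupling_degree_all and its threshold.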