Use case
1. At search time, run the query through a Chinese tokenizer and check whether any of the resulting tokens hits the sensitive-word library (a sketch appears after the SEARCH-mode test below).
Maven dependency
<properties>
    <java.version>17</java.version>
    <huaban.version>1.0.2</huaban.version>
</properties>

<!-- https://github.com/huaban/jieba-analysis -->
<dependency>
    <groupId>com.huaban</groupId>
    <artifactId>jieba-analysis</artifactId>
    <version>${huaban.version}</version>
</dependency>
SEARCH mode
Suited to the query phase (it guarantees search recall); this is the mode to use when segmenting search queries.
// Imports shared by the test snippets below
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;
import com.huaban.analysis.jieba.WordDictionary;
import org.junit.jupiter.api.Test;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

/**
 * Original: 我来到北京清华大学
 * Tokens:
 * word=我, start=0, end=1
 * word=来到, start=1, end=3
 * word=北京, start=3, end=5
 * word=清华大学, start=5, end=9
 */
@Test
public void testSegModeSearch() {
    JiebaSegmenter segmenter = new JiebaSegmenter();
    String sentence = "我来到北京清华大学"; // matches the expected output above
    System.out.println("Original: " + sentence);
    // SEARCH mode: precise, non-overlapping segmentation
    List<SegToken> tokens = segmenter.process(sentence, JiebaSegmenter.SegMode.SEARCH);
    System.out.println("Tokens:");
    for (SegToken token : tokens) {
        System.out.printf("word=%s, start=%d, end=%d%n",
                token.word, token.startOffset, token.endOffset);
    }
}
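Tying this back to the use case at the top: a minimal sketch of the sensitive-word check, assuming the word library is an in-memory set (SENSITIVE_WORDS and containsSensitiveWord below are illustrative names, not part of jieba-analysis):

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;

import java.util.Set;

public class SensitiveWordCheck {

    // Illustrative in-memory word library; in practice this would be loaded from a file or database
    private static final Set<String> SENSITIVE_WORDS = Set.of("违禁词", "敏感词");

    /** Segments the query in SEARCH mode and reports whether any token hits the library. */
    public static boolean containsSensitiveWord(String query) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        for (SegToken token : segmenter.process(query, JiebaSegmenter.SegMode.SEARCH)) {
            if (SENSITIVE_WORDS.contains(token.word)) {
                return true;
            }
        }
        return false;
    }
}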
INDEX mode
Suited to the indexing phase (building an inverted index): on top of the SEARCH result it also emits the finer-grained sub-words inside long terms, which keeps those parts searchable (see the sketch after the test below).
/**
 * Tokens:
 * word=小明, start=0, end=2
 * word=硕士, start=2, end=4
 * word=毕业, start=4, end=6
 * word=于, start=6, end=7
 * word=中国, start=7, end=9
 * word=科学, start=9, end=11
 * word=学院, start=10, end=12
 * word=科学院, start=9, end=12
 * word=中国科学院, start=7, end=12
 * word=计算, start=12, end=14
 * word=计算所, start=12, end=15
 */
@Test
public void testSegModeIndex() {
    JiebaSegmenter segmenter = new JiebaSegmenter();
    String sentence = "小明硕士毕业于中国科学院计算所";
    // INDEX mode additionally emits overlapping sub-words (科学 / 学院 / 科学院 ...)
    List<SegToken> tokens = segmenter.process(sentence, JiebaSegmenter.SegMode.INDEX);
    System.out.println("Tokens:");
    for (SegToken token : tokens) {
        System.out.printf("word=%s, start=%d, end=%d%n",
                token.word, token.startOffset, token.endOffset);
    }
}
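As a rough illustration of why those overlapping sub-words matter at index time, here is a minimal in-memory inverted index built from INDEX-mode tokens (TinyInvertedIndex is a hypothetical class, not part of jieba-analysis): a document containing 中国科学院 also becomes retrievable via 中国 or 科学院.

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TinyInvertedIndex {

    // word -> postings of {docId, startOffset}
    private final Map<String, List<int[]>> postings = new HashMap<>();
    private final JiebaSegmenter segmenter = new JiebaSegmenter();

    /** Indexes one document; sub-words like 科学院 get postings alongside 中国科学院. */
    public void addDocument(int docId, String text) {
        for (SegToken token : segmenter.process(text, JiebaSegmenter.SegMode.INDEX)) {
            postings.computeIfAbsent(token.word, k -> new ArrayList<>())
                    .add(new int[]{docId, token.startOffset});
        }
    }

    /** Looks up a word, typically one token of a SEARCH-mode segmented query. */
    public List<int[]> lookup(String word) {
        return postings.getOrDefault(word, List.of());
    }
}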
Using a custom dictionary
dicts/jieba.dict
中国科学院计算所 3 ns
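Each line follows jieba's user-dictionary convention: the word, a frequency weight, and an optional part-of-speech tag (ns marks a place name here); a higher weight makes the segmenter more likely to keep the word as a single token.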
/**
 * Reference: https://www.cnblogs.com/xuchen163/p/13444973.html
 * Using a custom dictionary
 */
@Test
public void testSegModeUserDict() throws Exception {
    JiebaSegmenter segmenter = new JiebaSegmenter();
    String sentence = "小明硕士毕业于中国科学院计算所";
    String before = segmenter.process(sentence, JiebaSegmenter.SegMode.INDEX).toString();
    System.out.println("Before loading the user dict: " + before);

    // Load the custom dictionary from the classpath into the word library
    Path path = Paths.get(getClass().getClassLoader().getResource("dicts/jieba.dict").toURI());
    WordDictionary.getInstance().loadUserDict(path);

    // Segment again: the custom entry 中国科学院计算所 now takes effect
    segmenter = new JiebaSegmenter();
    String after = segmenter.process(sentence, JiebaSegmenter.SegMode.INDEX).toString();
    System.out.println("After loading the user dict:  " + after);
}
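Note that WordDictionary appears to be a process-wide singleton shared by all segmenter instances, so once loadUserDict has run the custom entries should apply everywhere; after loading, the INDEX output should additionally contain 中国科学院计算所 as one token alongside its sub-words.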