Lucene本身提供了几个分词接口,后来我自己又写了一个分词接口.
功能递增如下:
WhitespaceAnalyzer:仅仅是去除空格,不对字符做小写(lowercase)转换,不支持中文
SimpleAnalyzer:功能强于WhitespaceAnalyzer,将letter之外的符号全部过滤掉,并且将所有的字符转换为小写(lowercase),不支持中文
StopAnalyzer:StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上
增加了去除StopWords的功能,不支持中文
StandardAnalyzer:英文的处理能力同于StopAnalyzer.支持中文采用的方法为单字切分.
ChineseAnalyzer:来自于Lucene的sandbox.性能类似于StandardAnalyzer,缺点是不支持中英文混合分词.
CJKAnalyzer:chedong写的CJKAnalyzer在英文处理上的功能和StandardAnalyzer相同,
但是在汉语的分词上,不能过滤掉标点符号,采用的是二元切分
TjuChineseAnalyzer:我写的,功能最为强大.在中文分词方面,由于其调用的是ICTCLAS的java接口,所以其在中文方面的性能同于ICTCLAS.在英文分词上,它采用了Lucene的StopAnalyzer,可以去除stopWords,而且可以不区分大小写,过滤掉各类标点符号.
程序调试于:JBuilder 2005
package org.apache.lucene.analysis;
//Author:zhangbufeng
//TjuAILab(天津大学人工智能实验室)
//2005.9.22.11:00
import java.io.*;
import junit.framework.*;
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.cn.*;
import org.apache.lucene.analysis.cjk.*;
import org.apache.lucene.analysis.tjucn.*;
import com.xjt.nlp.word.*;
public class TestAnalyzers extends TestCase {
public TestAnalyzers(String name) {
super(name);
}
public void assertAnalyzesTo(Analyzer a,
String input,
String[] output) throws Exception {
//前面的"dummy"好像没有用到
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
StringReader readerInput=new StringReader(input);
for (int i=0; i
Token t = ts.next();
//System.out.println(t);
assertNotNull(t);
//使用下面这条语句即可以输出Token的每项的text,并且用空格分开
System.out.print(t.termText);
System.out.print(" ");
assertEquals(t.termText(), output[i]);
}
System.out.println(" ");
assertNull(ts.next());
ts.close();
}
public void outputAnalyzer(Analyzer a ,String input) throws Exception{
TokenStream ts = a.tokenStream("dummy",new StringReader(input));
StringReader readerInput = new StringReader(input);
while(true){
Token t = ts.next();
if(t!=null){
System.out.print(t.termText);
System.out.print(" ");
}
else
break;
}
System.out.println(" ");
ts.close();
}
public void testSimpleAnalyzer() throws Exception {
//学习使用SimpleAnalyzer();
//SimpleAnalyzer将除去letter之外的符号全部过滤掉,并且将所有的字符lowcase化
Analyzer a = new SimpleAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "U.S.A.",
new String[] { "u", "s", "a" });
assertAnalyzesTo(a, "C++",
new String[] { "c" });
assertAnalyzesTo(a, "B2B",
new String[] { "b", "b" });
assertAnalyzesTo(a, "2B",
new String[] { "b" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "quoted", "word" });
assertAnalyzesTo(a,"zhang ./ bu <> feng",
new String[]{"zhang","bu","feng"});
ICTCLAS splitWord = new ICTCLAS();
String result = splitWord.paragraphProcess("我爱共产党 i LOVE chanchan");
assertAnalyzesTo(a,result,
new String[]{"我","爱","共产党","i","love","chanchan"});
}
public void testWhiteSpaceAnalyzer() throws Exception {
//WhiterspaceAnalyzer仅仅是去除空格,对字符没有lowcase化
Analyzer a = new WhitespaceAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",
new String[] { "foo", "bar", "FOO", "BAR" });
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
new String[] { "foo.bar.FOO.BAR" });
assertAnalyzesTo(a, "U.S.A.",
new String[] { "U.S.A." });
assertAnalyzesTo(a, "C++",
new String[] { "C++" });
assertAnalyzesTo(a, "B2B",
new String[] { "B2B" });
assertAnalyzesTo(a, "2B",
new String[] { "2B" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "\"QUOTED\"", "word" });
assertAnalyzesTo(a,"zhang bu feng",
new String []{"zhang","bu","feng"});
ICTCLAS splitWord = new ICTCLAS();
String result = splitWord.paragraphProcess("我爱共产党 i love chanchan");
assertAnalyzesTo(a,result,
new String[]{"我","爱","共产党","i","love","chanchan"});
}
public void testStopAnalyzer() throws Exception {
//StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上
//增加了去除StopWords的功能
Analyzer a = new StopAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
new String[]{"foo","bar","foo","bar"});
ICTCLAS splitWord = new ICTCLAS();
String result = splitWord.paragraphProcess("我爱共产党 i Love chanchan such");
assertAnalyzesTo(a,result,
new String[]{"我","爱","共产党","i","love","chanchan"});
}
public void testStandardAnalyzer() throws Exception{
//StandardAnalyzer的功能最为强大,对于中文采用的为单字切分
Analyzer a = new StandardAnalyzer();
assertAnalyzesTo(a,"foo bar Foo Bar",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"张步峰是天大学生",
new String[]{"张","步","峰","是","天","大","学","生"});
//验证去除英文的标点符号
assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",
new String[]{"张","步","峰","是","天","大","学","生"});
//验证去除中文的标点符号
assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
new String[]{"张","步","峰","是","天","大","学","生"});
}
public void testChineseAnalyzer() throws Exception{
//可见ChineseAnalyzer在功能上和standardAnalyzer的功能差不多,但是可能在速度上慢于StandardAnalyzer
Analyzer a = new ChineseAnalyzer();
//去空格
assertAnalyzesTo(a,"foo bar Foo Bar",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"张步峰是天大学生",
new String[]{"张","步","峰","是","天","大","学","生"});
//验证去除英文的标点符号
assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",
new String[]{"张","步","峰","是","天","大","学","生"});
//验证去除中文的标点符号
assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
new String[]{"张","步","峰","是","天","大","学","生"});
//不支持中英文写在一起
// assertAnalyzesTo(a,"我爱你 i love chanchan",
/// new String[]{"我","爱","你","i","love","chanchan"});
}
public void testCJKAnalyzer() throws Exception {
//chedong写的CJKAnalyzer的功能在英文处理上的功能和StandardAnalyzer相同
//但是在汉语的分词上,不能过滤掉标点符号,即使用二元切分
Analyzer a = new CJKAnalyzer();
assertAnalyzesTo(a,"foo bar Foo Bar",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
new String[]{"foo","bar","foo","bar"});
assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
new String[]{"foo","bar","foo","bar"});
// assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",
// new String[]{"张步","步峰","峰是","是天","天大","大学","学生"});
//assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
// new String[]{"张步","步峰","峰是","是天","天大","大学","学生"});
//支持中英文同时写
assertAnalyzesTo(a,"张步峰是天大学生 i love",
new String[]{"张步","步峰","峰是","是天","天大","大学","学生","i","love"});
}
public void testTjuChineseAnalyzer() throws Exception{
/**
* TjuChineseAnlyzer的功能相当强大,在中文分词方面由于其调用的为ICTCLAS的java接口.
* 所以其在中文方面性能上同与ICTCLAS.其在英文分词上采用了Lucene的StopAnalyzer,可以去除
* stopWords,而且可以不区分大小写,过滤掉各类标点符号.
*/
Analyzer a = new TjuChineseAnalyzer();
String input = "体育讯 在被尤文淘汰之后,皇马主帅博斯克拒绝接受媒体对球队后防线的批评,同时还为自己排出的首发阵容进行了辩护。"+
"“失利是全队的责任,而不仅仅是后防线该受指责,”博斯克说,“我并不认为我们踢得一塌糊涂。”“我们进入了半决赛,而且在晋级的道路上一路奋 "+
"战。即使是今天的比赛我们也有几个翻身的机会,但我们面对的对手非常强大,他们踢得非常好。”“我们的球迷应该为过去几个赛季里我们在冠军杯中的表现感到骄傲。”"+
"博斯克还说。对于博斯克在首发中排出了久疏战阵的坎比亚索,赛后有记者提出了质疑,认为完全应该将队内的另一 "+
"名球员帕文派遣上场以加强后卫线。对于这一疑议,博斯克拒绝承担所谓的“责任”,认为球队的首发没有问题。“我们按照整个赛季以来的方式做了,"+
"对于人员上的变化我没有什么可说的。”对于球队在本赛季的前景,博斯克表示皇马还有西甲联赛的冠军作为目标。“皇家马德里在冠军 "+
"杯中战斗到了最后,我们在联赛中也将这么做。”"+
"A Java User Group is a group of people who share a common interest in Java technology and meet on a regular basis to share"+
" technical ideas and information. The actual structure of a JUG can vary greatly - from a small number of friends and coworkers"+
" meeting informally in the evening, to a large group of companies based in the same geographic area. "+
"Regardless of the size and focus of a particular JUG, the sense of community spirit remains the same. ";
outputAnalyzer(a,input);
//此处我已经对大文本进行过测试,不会有问题效果很好
outputAnalyzer(a,"我爱共产党 ,,。 I love China 我喜欢唱歌 ");
assertAnalyzesTo(a,"我爱共产党 ,,。I love China 我喜欢唱歌",
new String[]{"爱","共产党","i","love","china","喜欢","唱歌"});
}
}
联系客服