为推文优化的Lucene Analyzer类
2021-07-04 00:03
标签:extend length cat inf filter ret add pen lan
原文地址:http://www.cnblogs.com/ljbguanli/p/7122997.html
/**
* @author YangXin
* @info Analyzer optimized for Twitter text using the DoubleMetaphone encoder.
* DoubleMetaphone produces the same key for words that are pronounced alike.
*
*/
package unitTwelve;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
public class TwitterAnalyzer extends Analyzer{
private DoubleMetaphone filter = new DoubleMetaphone();
public TokenStream result = new PorterStemFilter(new StopFilter(true, new StandardTokenizer(Version.LUCENE_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET));
TermAttribute termAtt = (TermAttribute) result.addAttribute(TermAttribute.class);
StringBuilder buf = new StringBuilder();
try{
while(result.incrementToken()){
String word = new String(termAtt.term(), 0, termAtt.termLength());
buf.append(filter.encode(filter.encode(word)).append(" "));
}
}catch(IOException e){
e.printStackTrace();
}
return new WhitespaceTokenizer(new StringReader(buf.toString()));
}
}
下一篇:html空格占位符