package org.wikimedia.lsearch.analyzers;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.log4j.Logger;
/**
* Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer.
*
* @author rainman
*
*/
public class FilterFactory {
    static org.apache.log4j.Logger log = Logger.getLogger(FilterFactory.class);

    /** Language code this factory was created for; {@link #init()} replaces null with "en". */
    protected String lang;
    /** Name handed to {@link SnowballFilter}, or null when Snowball is not used. */
    protected String snowballName = null;
    protected boolean useStemmer,useCustomFilter;
    /** Stemmer class instantiated reflectively when no Snowball name is set. */
    protected Class<?> stemmer = null;
    /** Custom filter class instantiated reflectively by {@link #makeCustomFilter(TokenStream)}. */
    protected Class<?> customFilter = null;
    protected boolean usingCJK = false;
    /** Twin of this factory with stemming turned off; null for factories built with the full constructor. */
    protected FilterFactory noStemmerFilterFactory=null;

    /**
     * Builds a factory configured for the given language and, alongside it,
     * a second factory with the same settings but stemming disabled.
     *
     * @param lang language code; null is treated as "en"
     */
    public FilterFactory(String lang){
        this.lang = lang;
        // NOTE: init() is protected and invoked from the constructor, so an
        // override runs before the subclass' own fields are initialized.
        init();
        // Use the field, not the parameter: init() may have replaced a null
        // language with the default, and both factories must agree on it.
        noStemmerFilterFactory = new FilterFactory(this.lang,snowballName,false,useCustomFilter,null,customFilter);
        // The full constructor has no CJK parameter, so carry the flag over
        // explicitly; otherwise the no-stemmer twin would report false.
        noStemmerFilterFactory.usingCJK = usingCJK;
    }

    /**
     * Builds a factory from explicit settings. No language detection is done
     * and no separate no-stemmer factory is created.
     *
     * @param lang            language code reported by {@link #getLanguage()}
     * @param snowballName    Snowball stemmer name, or null
     * @param useStemmer      whether {@link #makeStemmer(TokenStream)} may produce a filter
     * @param useCustomFilter whether {@link #makeCustomFilter(TokenStream)} may produce a filter
     * @param stemmer         stemmer class with a (TokenStream) constructor, or null
     * @param customFilter    custom filter class with a (TokenStream) constructor, or null
     */
    public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class<?> stemmer, Class<?> customFilter) {
        this.lang = lang;
        this.snowballName = snowballName;
        this.useStemmer = useStemmer;
        this.useCustomFilter = useCustomFilter;
        this.stemmer = stemmer;
        this.customFilter = customFilter;
    }

    /**
     * @return the stemming-disabled twin created by the single-argument
     *         constructor, or this factory itself when no twin exists
     */
    public FilterFactory getNoStemmerFilterFactory() {
        return noStemmerFilterFactory != null ? noStemmerFilterFactory : this;
    }

    /**
     * Derives stemmer and custom-filter settings from {@link #lang},
     * defaulting a null language to "en".
     */
    protected void init(){
        if(lang == null) {
            lang = "en";
        }

        // figure out stemmer
        useStemmer = true;
        if(lang.equals("en")) {
            // Original note: PorterStemFilter is 2x faster but less accurate.
            snowballName = "English";
        } else if(lang.equals("da")) {
            snowballName = "Danish";
        } else if(lang.equals("nl")) {
            snowballName = "Dutch";
        } else if(lang.equals("fi")) {
            snowballName = "Finnish";
        } else if(lang.equals("de")) {
            snowballName = "German";
        } else if(lang.equals("it")) {
            snowballName = "Italian";
        } else if(lang.equals("no")) {
            snowballName = "Norwegian";
        } else if(lang.equals("pt")) {
            snowballName = "Portuguese";
        } else if(lang.equals("ru")) {
            // Dedicated filter class is used here instead of Snowball "Russian".
            stemmer = RussianStemFilter.class;
        } else if(lang.equals("es")) {
            snowballName = "Spanish";
        } else if(lang.equals("sv")) {
            snowballName = "Swedish";
        } else if(lang.equals("eo")) {
            stemmer = EsperantoStemFilter.class;
        } else {
            useStemmer = false;
        }

        // figure out custom filter
        useCustomFilter = true;
        if(lang.equals("th")) {
            customFilter = ThaiWordFilter.class;
        } else if(lang.equals("sr")) {
            customFilter = SerbianFilter.class;
        } else if(lang.equals("vi")) {
            customFilter = VietnameseFilter.class;
        } else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
                lang.equals("zh-classical") || lang.equals("zh-yue")){
            customFilter = CJKFilter.class;
            usingCJK = true;
        } else {
            useCustomFilter = false;
        }
    }

    /**
     * Wraps the stream in this factory's stemmer.
     *
     * @param in stream to wrap
     * @return the stemming filter, or null when stemming is disabled, no
     *         stemmer is configured, or the stemmer class could not be instantiated
     */
    public TokenFilter makeStemmer(TokenStream in){
        if(!useStemmer) {
            log.debug( "No stemmer" );
            return null;
        }
        if(snowballName != null) {
            log.debug( "use SnowBall " + snowballName );
            return new SnowballFilter(in,snowballName);
        }
        return instantiate(stemmer, in);
    }

    /**
     * Wraps the stream in this factory's custom filter.
     *
     * @param in stream to wrap
     * @return the custom filter, or null when custom filtering is disabled, no
     *         filter class is configured, or the class could not be instantiated
     */
    public TokenFilter makeCustomFilter(TokenStream in){
        if(!useCustomFilter) {
            return null;
        }
        return instantiate(customFilter, in);
    }

    /**
     * Reflectively calls the (TokenStream) constructor of the given class.
     * Any failure is logged and reported to the caller as null.
     */
    private static TokenFilter instantiate(Class<?> filterClass, TokenStream in) {
        if(filterClass == null) {
            return null;
        }
        try {
            return (TokenFilter) filterClass.getConstructor(TokenStream.class).newInstance(in);
        } catch (Exception e) {
            // Best-effort: callers already treat null as "no filter", so log
            // through the class logger instead of printing to stderr.
            log.error("Could not instantiate filter " + filterClass.getName(), e);
            return null;
        }
    }

    /** @return true if stemming is enabled for this factory */
    public boolean hasStemmer(){
        return useStemmer;
    }

    /** @return true if this factory was configured with the CJK filter */
    public boolean isUsingCJK() {
        return usingCJK;
    }

    /** @return true if a custom filter is enabled for this factory */
    public boolean hasCustomFilter(){
        return useCustomFilter;
    }

    /** @return the language code of this factory */
    public String getLanguage(){
        return lang;
    }
}