List of usage examples for the org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter constructor:
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject)
This filter emits Double Metaphone phonetic codes for each token, either injecting them alongside the original tokens (inject=true) or replacing the original tokens (inject=false). From source file: com.example.PhoneticTokenFilterFactory.java
License:Apache License
/**
 * Wraps {@code tokenStream} with the phonetic filter selected by this factory's
 * configuration.
 *
 * <p>Selection order: an explicit {@code encoder} wins; otherwise a configured
 * Beider-Morse rule/name type pair (optionally restricted to a language set);
 * otherwise Double Metaphone when {@code maxcodelength} is positive.
 *
 * @param tokenStream the upstream token stream to decorate
 * @return the phonetic-encoding token stream
 * @throws IllegalArgumentException if no phonetic configuration is usable
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (encoder != null) {
        // A concrete encoder takes precedence; inject codes unless "replace" was requested.
        return new PhoneticFilter(tokenStream, encoder, !replace);
    }
    if (ruletype != null && nametype != null) {
        final PhoneticEngine engine = new PhoneticEngine(nametype, ruletype, true);
        if (languageset != null) {
            final LanguageSet languages = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
            return new BeiderMorseFilter(tokenStream, engine, languages);
        }
        return new BeiderMorseFilter(tokenStream, engine);
    }
    if (maxcodelength > 0) {
        return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
    }
    throw new IllegalArgumentException("encoder error");
}
From source file:org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java
License:Apache License
@Override public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(mMatchVersion, reader); result = new StandardFilter(mMatchVersion, result); result = new LowerCaseFilter(mMatchVersion, result); // lowercased only result = new StopFilter(mMatchVersion, result, mStopWordSet); // remove stopwords result = new DoubleMetaphoneFilter(result, mMaxCodeLength, true); // store phonetic code result = new ShingleFilter(result, 2, 3); // create token n-grams return result; }
From source file:org.elasticsearch.index.analysis.PhoneticTokenFilterFactory.java
License:Apache License
/**
 * Wraps {@code tokenStream} with the phonetic filter selected by this factory's
 * configuration.
 *
 * <p>Selection order: an explicit {@code encoder} wins; otherwise a configured
 * Beider-Morse rule/name type pair (optionally restricted to a language set);
 * otherwise Double Metaphone when {@code maxcodelength} is positive.
 *
 * @param tokenStream the upstream token stream to decorate
 * @return the phonetic-encoding token stream
 * @throws IllegalArgumentException if no phonetic configuration is usable
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (encoder == null) {
        if (ruletype != null && nametype != null) {
            if (languageset != null) {
                // FIX: was a raw 'new HashSet(...)' — use the diamond so the set is
                // a typed Set<String> and the call compiles without unchecked warnings.
                final LanguageSet languages = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
                return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true),
                        languages);
            }
            return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
        }
        if (maxcodelength > 0) {
            return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
        }
    } else {
        // A concrete encoder takes precedence; inject codes unless "replace" was requested.
        return new PhoneticFilter(tokenStream, encoder, !replace);
    }
    throw new IllegalArgumentException("encoder error");
}
From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java
License:Open Source License
public static PerFieldAnalyzerWrapper getAnaylzer() { Map<String, Analyzer> analyzerPerField = new HashMap<>(); //add a literal analyzer -- keep all special characters analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer); analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer); //treat as string field by analyzing with the KeywordAnalyzer analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer()); analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer()); analyzerPerField.put("isPreferred", new KeywordAnalyzer()); analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer()); if (doubleMetaphoneEnabled_) { Analyzer temp = new Analyzer() { @Override/* w w w .j a v a 2 s.c o m*/ protected TokenStreamComponents createComponents(String fieldName) { final StandardTokenizer source = new StandardTokenizer( AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET); filter = new DoubleMetaphoneFilter(filter, 4, false); return new TokenStreamComponents(source, filter); } }; analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp); } if (normEnabled_) { try { Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET); analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp); } catch (NoClassDefFoundError e) { // } } if (stemmingEnabled_) { Analyzer temp = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { final StandardTokenizer source = new StandardTokenizer( AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET); filter = new SnowballFilter(filter, 
"English"); return new TokenStreamComponents(source, filter); } }; analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp); } final CharArraySet dividerList = new CharArraySet(10, true); dividerList.add(STRING_TOKENIZER_TOKEN); Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true)); Analyzer qualifierAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String arg0) { final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); TokenStream filter = new LowerCaseFilter(source); Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)"); filter = new PatternReplaceFilter(filter, pattern, " ", true); return new TokenStreamComponents(source, filter); } }; analyzerPerField.put("sources", sa); analyzerPerField.put("usageContexts", sa); analyzerPerField.put("qualifiers", qualifierAnalyzer); // no stop words, default character removal set. PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), analyzerPerField); return analyzer; }
From source file:org.lexevs.dao.index.metadata.BaseMetaDataLoader.java
License:Open Source License
public static Analyzer getMetadataAnalyzer() { Map<String, Analyzer> analyzerPerField = new HashMap<>(); if (doubleMetaphoneEnabled_) { Analyzer temp = new Analyzer() { @Override/*from w ww . ja v a2s. c om*/ protected TokenStreamComponents createComponents(String fieldName) { final StandardTokenizer source = new StandardTokenizer( AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET); filter = new DoubleMetaphoneFilter(filter, 4, true); return new TokenStreamComponents(source, filter); } }; analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp); } if (normEnabled_) { try { Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET); analyzerPerField.put(normPrefix_ + "propertyValue", temp); } catch (NoClassDefFoundError e) { // norm is not available normEnabled_ = false; } } if (stemmingEnabled_) { Analyzer temp = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { final StandardTokenizer source = new StandardTokenizer( AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET); filter = new SnowballFilter(filter, "English"); return new TokenStreamComponents(source, filter); } }; analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp); } // these fields just get simple analyzing. List<String> dividerList = new ArrayList<String>(); dividerList.add(STRING_TOKENIZER_TOKEN); Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true)); analyzerPerField.put("parentContainers", sa); // no stop words, default character removal set. 
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), analyzerPerField); return analyzer; }