Example usage for org.apache.lucene.analysis.phonetic DoubleMetaphoneFilter DoubleMetaphoneFilter

List of usage examples for org.apache.lucene.analysis.phonetic DoubleMetaphoneFilter DoubleMetaphoneFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.phonetic DoubleMetaphoneFilter DoubleMetaphoneFilter.

Prototype

public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) 

Source Link

Document

Creates a DoubleMetaphoneFilter with the specified maximum code length. When inject is true, the phonetically encoded forms are added to the token stream as synonyms alongside the original tokens; when inject is false, the encoded forms replace the original tokens.

Usage

From source file:com.example.PhoneticTokenFilterFactory.java

License:Apache License

/**
 * Wraps the given token stream in the configured phonetic filter.
 * Selection order: an explicit encoder wins; otherwise Beider-Morse when both
 * rule type and name type are set; otherwise Double Metaphone when a positive
 * maximum code length is configured. Anything else is a configuration error.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // An explicitly configured encoder takes precedence over everything else.
    if (encoder != null) {
        return new PhoneticFilter(tokenStream, encoder, !replace);
    }
    // Beider-Morse mode requires both a rule type and a name type.
    if (ruletype != null && nametype != null) {
        final PhoneticEngine engine = new PhoneticEngine(nametype, ruletype, true);
        if (languageset == null) {
            return new BeiderMorseFilter(tokenStream, engine);
        }
        final LanguageSet languages = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
        return new BeiderMorseFilter(tokenStream, engine, languages);
    }
    // Double Metaphone mode is selected by a positive maximum code length;
    // !replace maps the "replace" setting onto the filter's "inject" flag.
    if (maxcodelength > 0) {
        return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
    }
    throw new IllegalArgumentException("encoder error");
}

From source file:org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java

License:Apache License

/**
 * Builds the phonetic analysis chain for a field: standard tokenization,
 * lowercasing, stop-word removal, Double Metaphone encoding (original tokens
 * kept as synonyms via inject=true), then 2-3 token shingles.
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new StandardTokenizer(mMatchVersion, reader);
    stream = new StandardFilter(mMatchVersion, stream);
    stream = new LowerCaseFilter(mMatchVersion, stream);
    stream = new StopFilter(mMatchVersion, stream, mStopWordSet);
    stream = new DoubleMetaphoneFilter(stream, mMaxCodeLength, true);
    return new ShingleFilter(stream, 2, 3);
}

From source file:org.elasticsearch.index.analysis.PhoneticTokenFilterFactory.java

License:Apache License

/**
 * Wraps the given token stream in the configured phonetic filter.
 *
 * <p>With an explicit encoder, a {@code PhoneticFilter} is returned. Without
 * one, Beider-Morse is used when both {@code ruletype} and {@code nametype}
 * are set (optionally restricted to {@code languageset}), and Double Metaphone
 * when {@code maxcodelength > 0}. {@code !replace} maps the "replace" setting
 * onto the filters' "inject" flag.
 *
 * @param tokenStream the upstream token stream to wrap
 * @return the phonetic-filtered token stream
 * @throws IllegalArgumentException if no filter mode is configured
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (encoder == null) {
        if (ruletype != null && nametype != null) {
            if (languageset != null) {
                // Use the generic diamond form (was a raw HashSet, which
                // produced an unchecked-conversion warning).
                final LanguageSet languages = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
                return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true),
                        languages);
            }
            return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
        }
        if (maxcodelength > 0) {
            return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
        }
    } else {
        return new PhoneticFilter(tokenStream, encoder, !replace);
    }
    throw new IllegalArgumentException("encoder error");
}

From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java

License:Open Source License

/**
 * Builds the per-field analyzer used for indexing.
 *
 * <p>Field-specific analyzers (literal, keyword, optional Double Metaphone /
 * norm / stemming variants, and divider-token analyzers for qualifier-style
 * fields) are registered in a map; every other field falls through to a
 * {@link StandardAnalyzer} with an empty stop-word set.
 *
 * <p>NOTE(review): the method name is misspelled ("Anaylzer") but is part of
 * the public API — renaming would break existing callers.
 *
 * @return the configured {@link PerFieldAnalyzerWrapper}
 */
public static PerFieldAnalyzerWrapper getAnaylzer() {

    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    //add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    //treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    // Optional phonetic variant: standard tokenization, lowercasing, stop-word
    // removal, then Double Metaphone with code length 4 and inject=false
    // (encoded forms replace the original tokens).
    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    // Optional norm variant; silently skipped if its classes are not on the
    // classpath (NoClassDefFoundError is deliberately ignored).
    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            //
        }
    }

    // Optional stemming variant: same chain as above but with an English
    // Snowball stemmer instead of the phonetic filter.
    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    // Fields whose values are joined by STRING_TOKENIZER_TOKEN: treat the
    // divider as a stop word so it never appears in the index.
    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    // Qualifier fields: lowercase, then strip punctuation/bracket characters
    // by replacing every regex match with a space.
    Analyzer qualifierAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }

    };
    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}

From source file:org.lexevs.dao.index.metadata.BaseMetaDataLoader.java

License:Open Source License

/**
 * Builds the per-field analyzer used for metadata indexing.
 *
 * <p>Registers optional Double Metaphone, norm, and stemming variants of the
 * "propertyValue" field (keyed by their respective prefixes) plus a divider
 * analyzer for "parentContainers"; all other fields fall back to a
 * {@link StandardAnalyzer} with an empty stop-word set.
 *
 * @return the configured {@link PerFieldAnalyzerWrapper}
 */
public static Analyzer getMetadataAnalyzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    // Optional phonetic variant: standard tokenization, lowercasing,
    // stop-word removal, then Double Metaphone with code length 4 and
    // inject=true (original tokens kept alongside the encoded forms).
    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, true);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(normPrefix_ + "propertyValue", temp);
        } catch (NoClassDefFoundError e) {
            // norm is not available: disable it so later calls skip this
            // branch entirely (note: mutates shared static state).
            normEnabled_ = false;
        }
    }

    // Optional stemming variant: same chain as the phonetic one but with an
    // English Snowball stemmer instead of the Double Metaphone filter.
    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
    }

    // these fields just get simple analyzing: treat the divider token as an
    // ignore-case stop word so it never reaches the index.
    List<String> dividerList = new ArrayList<>();
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    analyzerPerField.put("parentContainers", sa);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);

    return analyzer;
}