Example usage for org.apache.lucene.analysis.pattern PatternReplaceFilter PatternReplaceFilter

List of usage examples for org.apache.lucene.analysis.pattern PatternReplaceFilter PatternReplaceFilter

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.pattern PatternReplaceFilter constructor.

Prototype

public PatternReplaceFilter(TokenStream in, Pattern p, String replacement, boolean all) 

Source Link

Document

Constructs an instance that replaces either the first occurrence or all occurrences of the pattern in each token, depending on the boolean `all` argument.

Usage

From source file:at.itbh.bev.index.AddressLineExactMatchAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Treat the entire input as one token, lower-case it, then strip every
    // non-alphabetic character so only letters remain for exact matching.
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new PatternReplaceFilter(new LowerCaseFilter(tokenizer),
            RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.HouseIdAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Whole input as a single lower-cased token, normalized through three
    // pattern-replace stages, then expanded into edge n-grams (lengths 1-4)
    // so that house-id prefixes become searchable.
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    chain = new EdgeNGramTokenFilter(chain, 1, 4);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.HouseIdExactMatchAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Same normalization pipeline as the n-gram variant, but without the
    // edge n-gram expansion: one lower-cased keyword token, cleaned by
    // three pattern-replace stages, used for exact house-id matching.
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.TextAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Single keyword token, lower-cased, stemmed and stripped of non-letters
    // via pattern replacement, encoded phonetically (Cologne phonetics,
    // keeping the original token), then split into 2-6 character n-grams.
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.addressLineStemmingPattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    chain = new PhoneticFilter(chain, new ColognePhonetic(), true);
    chain = new NGramTokenFilter(chain, 2, 6);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:org.apache.jackrabbit.oak.plugins.index.solr.configuration.DefaultAnalyzersConfigurationTest.java

License:Apache License

@Before
public void setUp() throws Exception {
    // Exact path matching: the whole path is kept as a single, untouched
    // keyword token at both index and query time.
    this.exactPathAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path indexing: single keyword token, no filtering.
    this.parentPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path searching: reverse the token, remove the first
    // "segment/" match (all=false, so only one replacement — this is the
    // last path segment in original orientation), then reverse back.
    // Net effect: the query token becomes the parent path.
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };

    // Direct-children indexing: operate on the reversed path, drop tokens
    // shorter than 2 chars, collapse the (reversed) leading segment to its
    // slash, strip the remaining leading slash, then reverse back.
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new LengthFilter(Version.LUCENE_47, filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Direct-children searching: plain keyword token, no transformation.
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };

    // All-children indexing: split the path into hierarchy prefixes, emit a
    // capture-group token for every suffix starting at a slash (original
    // tokens preserved: preserveOriginal=false here refers to the capture
    // filter's flag), then drop duplicate tokens.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new PathHierarchyTokenizer(reader);
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false,
                    Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // All-children searching: plain keyword token, no transformation.
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
}

From source file:org.apache.solr.spelling.TestSuggestSpellingConverter.java

License:Apache License

public void testComplicated() throws Exception {
    // lowercases, removes field names, other syntax, collapses runs of whitespace, etc.
    converter.setAnalyzer(new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            // First alternative strips "name:" field prefixes; second
            // collapses any run of non-letter/digit syntax characters.
            // replaceAll=true applies the replacement to every match,
            // substituting a single space each time.
            TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile(
                    "([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"),
                    " ", true);
            filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
            filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false);
            return new TokenStreamComponents(tokenizer, filter);
        }
    });
    // Each case pins one behavior: syntax removal, field-name stripping,
    // lowercasing, and whitespace collapsing/trimming.
    assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
    assertConvertsTo("test~", new String[] { "test" });
    assertConvertsTo("field:test", new String[] { "test" });
    assertConvertsTo("This is a test", new String[] { "this is a test" });
    assertConvertsTo(" This is  a test", new String[] { "this is a test" });
    assertConvertsTo("Foo (field:bar) text_hi:?    ",
            new String[] { "foo bar ?" });
}

From source file:org.elasticsearch.analysis.common.PatternReplaceTokenFilterFactory.java

License:Apache License

@Override
public TokenStream create(TokenStream tokenStream) {
    // Wrap the incoming stream in a pattern-replace stage driven by this
    // factory's configured pattern, replacement string, and all-matches flag.
    final PatternReplaceFilter wrapped =
            new PatternReplaceFilter(tokenStream, pattern, replacement, all);
    return wrapped;
}

From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java

License:Open Source License

private TokenStream createStream(CorpusMetadata metadata, Tokenizer tokenizer) {
    // Build the token-filter chain in a fixed order; each optional stage
    // wraps the previous one only when the corpus metadata requests it.
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LengthFilter(chain, (int) metadata.minTokenLength, (int) metadata.maxTokenLength);

    // Language-specific stop-word removal, skipped when no list is given.
    if (!metadata.stopWords.isEmpty()) {
        chain = getStopFilter(metadata.language, metadata.stopWords, chain);
    }

    // Stemming strength > 0 enables the language-specific stemmer.
    if (metadata.applyStemmer > 0) {
        chain = getStemmerFilter(metadata.language, (int) metadata.applyStemmer, chain);
    }

    // Fold accented characters to their ASCII equivalents.
    if (metadata.removeAccents) {
        chain = new ASCIIFoldingFilter(chain);
    }

    // Substitute numeric tokens with a single placeholder (first match only).
    if (metadata.replaceNumbers) {
        chain = new PatternReplaceFilter(chain, NUMBER_PATTERN, NUMBER_PLACEHOLDER, false);
    }

    return chain;
}

From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java

License:Open Source License

// Builds the per-field analyzer used by the Lucene loader: literal/keyword
// fields get exact analyzers, while optional metaphone, norm, and stemming
// analyzers are added depending on the static feature flags.
// NOTE(review): method name has a typo ("Anaylzer"); kept as-is because
// renaming would break existing callers.
public static PerFieldAnalyzerWrapper getAnaylzer() {

    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    //add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    //treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    // Optional phonetic matching: standard tokenization, lower-casing,
    // stop-word removal, then DoubleMetaphone codes (max length 4, original
    // tokens not kept).
    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    // Optional norm field: plain StandardAnalyzer with no stop words.
    // NoClassDefFoundError is deliberately swallowed — the norm analyzer is
    // treated as unavailable when its dependency is missing.
    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            //
        }
    }

    // Optional stemming field: same pipeline as the metaphone analyzer but
    // ending in an English Snowball stemmer.
    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    // "sources"/"usageContexts" use a StandardAnalyzer whose stop-word set
    // contains only the divider token, effectively splitting on it.
    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    // "qualifiers" lower-cases tokens and replaces punctuation/bracket
    // characters (and the "<:>" divider) with spaces.
    Analyzer qualifierAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }

    };
    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}

From source file:sub.fwb.WildcardsAcceptingPatternReplaceFilterFactory.java

License:Apache License

@Override
public PatternReplaceFilter create(TokenStream input) {
    // Delegate to the standard Lucene filter using this factory's compiled
    // pattern, replacement string, and replace-all flag.
    final PatternReplaceFilter filter =
            new PatternReplaceFilter(input, pattern, replacement, replaceAll);
    return filter;
}