Usage examples for the org.apache.lucene.analysis.pattern.PatternReplaceFilter constructor
public PatternReplaceFilter(TokenStream in, Pattern p, String replacement, boolean all)
From source file:at.itbh.bev.index.AddressLineExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for exact address-line matching: the whole
 * input becomes one keyword token, which is lower-cased and then stripped
 * of every non-alphabetic character.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    // Remove all non-alpha characters (replace-all = true, replacement empty).
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.HouseIdAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for fuzzy house-id matching: keyword-tokenize
 * the whole input, lower-case it, normalize via three pattern replacements,
 * then emit edge n-grams of length 1-4 for prefix matching.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    // Drop characters irrelevant to exact house-id comparison.
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    // Normalize separators to "/".
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    // Collapse repeated non-word characters to a single occurrence ($1).
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    chain = new EdgeNGramTokenFilter(chain, 1, 4);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.HouseIdExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for exact house-id matching: identical to the
 * fuzzy house-id chain (lower-case plus three normalizing pattern
 * replacements) but without the trailing edge n-gram expansion.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    // Drop characters irrelevant to exact house-id comparison.
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    // Normalize separators to "/".
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    // Collapse repeated non-word characters to a single occurrence ($1).
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.TextAnalyzer.java
License:Open Source License
/**
 * Builds the free-text analysis chain: keyword-tokenize, lower-case,
 * apply stemming and non-alpha cleanup via pattern replacement, encode
 * with the Cologne phonetic algorithm (keeping originals), and finally
 * expand into n-grams of length 2-6.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.addressLineStemmingPattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    // true = inject phonetic codes alongside the original tokens.
    chain = new PhoneticFilter(chain, new ColognePhonetic(), true);
    chain = new NGramTokenFilter(chain, 2, 6);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:org.apache.jackrabbit.oak.plugins.index.solr.configuration.DefaultAnalyzersConfigurationTest.java
License:Apache License
/**
 * Prepares one analyzer per path-matching strategy exercised by the tests.
 * Code is unchanged from the original; only formatting and comments added.
 */
@Before
public void setUp() throws Exception {
    // Exact match: the whole path is indexed as a single keyword token.
    this.exactPathAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path indexing: plain keyword, matching is done on the search side.
    this.parentPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path searching: reverse the path, strip the (now leading) last
    // segment with a single replacement, reverse back -> the parent path.
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Direct-children indexing: reverse, keep paths of length >= 2, trim the
    // leaf segment and the separator via two single replacements, reverse back.
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new LengthFilter(Version.LUCENE_47, filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Direct-children searching: plain keyword token.
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
    // All-children indexing: hierarchy tokenizer emits every ancestor prefix;
    // capture-group filter adds the "/..." suffixes, duplicates removed.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new PathHierarchyTokenizer(reader);
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // All-children searching: plain keyword token.
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
}
From source file:org.apache.solr.spelling.TestSuggestSpellingConverter.java
License:Apache License
public void testComplicated() throws Exception { // lowercases, removes field names, other syntax, collapses runs of whitespace, etc. converter.setAnalyzer(new Analyzer() { @Override/* w w w. jav a 2s. c o m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile( "([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true); filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter); filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false); return new TokenStreamComponents(tokenizer, filter); } }); assertConvertsTo("test1 +test2", new String[] { "test1 test2" }); assertConvertsTo("test~", new String[] { "test" }); assertConvertsTo("field:test", new String[] { "test" }); assertConvertsTo("This is a test", new String[] { "this is a test" }); assertConvertsTo(" This is a test", new String[] { "this is a test" }); assertConvertsTo("Foo (field:bar) text_hi:? ", new String[] { "foo bar ?" }); }
From source file:org.elasticsearch.analysis.common.PatternReplaceTokenFilterFactory.java
License:Apache License
/**
 * Wraps the incoming stream in a {@code PatternReplaceFilter} configured
 * from this factory's pattern, replacement text, and replace-all flag.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    TokenStream replaced = new PatternReplaceFilter(tokenStream, pattern, replacement, all);
    return replaced;
}
From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java
License:Open Source License
/**
 * Assembles the pre-processing pipeline for the given corpus metadata:
 * standard filtering, token-length bounds, then optional stop-word
 * removal, stemming, accent folding, and number replacement.
 */
private TokenStream createStream(CorpusMetadata metadata, Tokenizer tokenizer) {
    TokenStream pipeline = new StandardFilter(tokenizer);
    pipeline = new LengthFilter(pipeline, (int) metadata.minTokenLength, (int) metadata.maxTokenLength);
    if (!metadata.stopWords.isEmpty()) {
        pipeline = getStopFilter(metadata.language, metadata.stopWords, pipeline);
    }
    if (metadata.applyStemmer > 0) {
        pipeline = getStemmerFilter(metadata.language, (int) metadata.applyStemmer, pipeline);
    }
    if (metadata.removeAccents) {
        pipeline = new ASCIIFoldingFilter(pipeline);
    }
    if (metadata.replaceNumbers) {
        // Map the first number match per token onto the shared placeholder.
        pipeline = new PatternReplaceFilter(pipeline, NUMBER_PATTERN, NUMBER_PLACEHOLDER, false);
    }
    return pipeline;
}
From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java
License:Open Source License
/**
 * Builds the per-field analyzer wrapper used for indexing: literal and
 * keyword fields get dedicated analyzers, and double-metaphone, norm, and
 * stemming fields are added only when the corresponding feature flag is
 * set. Code is unchanged from the original; only formatting and comments
 * added. (Method name "getAnaylzer" is a pre-existing typo kept for
 * caller compatibility.)
 */
public static PerFieldAnalyzerWrapper getAnaylzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    // add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);
    // treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());
    if (doubleMetaphoneEnabled_) {
        // Standard tokenize -> lower-case -> stop words -> double metaphone
        // (max code length 4, originals not kept).
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }
    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            // NOTE(review): norm support appears optional — the missing-class
            // error is deliberately swallowed so indexing proceeds without it.
        }
    }
    if (stemmingEnabled_) {
        // Standard tokenize -> lower-case -> stop words -> English Snowball stemmer.
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }
    // Qualifier-style fields: treat the project's divider token as a stop word.
    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    // Lower-case, then replace all bracket/punctuation characters and the
    // "<:>" marker with spaces so qualifier parts tokenize separately.
    Analyzer qualifierAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }
    };
    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);
    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}
From source file:sub.fwb.WildcardsAcceptingPatternReplaceFilterFactory.java
License:Apache License
/**
 * Produces a {@code PatternReplaceFilter} over the supplied stream using
 * this factory's configured pattern, replacement text, and replace-all
 * setting.
 */
@Override
public PatternReplaceFilter create(TokenStream input) {
    PatternReplaceFilter configured = new PatternReplaceFilter(input, pattern, replacement, replaceAll);
    return configured;
}