Usage examples for the org.apache.lucene.analysis.core.KeywordTokenizer constructor:
public KeywordTokenizer()
From source file:at.itbh.bev.index.AddressLineExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for exact address-line matching: the entire field
 * value becomes one keyword token, which is lower-cased and then has every
 * match of {@code RegexPatternCollection.nonAlphaCharPattern} replaced with
 * the empty string.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.HouseIdAnalyzer.java
License:Open Source License
/**
 * Builds the house-id analysis chain: the whole value is one keyword token,
 * lower-cased, normalized via three pattern-replace passes (removal of
 * {@code houseIdExactRemovePattern} matches, rewriting {@code replacePattern}
 * matches to "/", collapsing {@code uniquifyNonWordCharPattern} matches to
 * their first group), and finally expanded into edge n-grams of length 1-4.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    chain = new EdgeNGramTokenFilter(chain, 1, 4);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.HouseIdExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the exact-match house-id analysis chain: identical to the house-id
 * chain (single keyword token, lower-cased, three pattern-replace
 * normalization passes) but without the trailing edge n-gram expansion, so
 * only the fully normalized value is indexed.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.PostalCodeAnalyzer.java
License:Open Source License
/**
 * Builds the postal-code analysis chain: the whole value is one keyword
 * token, lower-cased and expanded into edge n-grams of length 3-4 to allow
 * prefix matching on postal codes.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new EdgeNGramTokenFilter(new LowerCaseFilter(tokenizer), 3, 4);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:at.itbh.bev.index.TextAnalyzer.java
License:Open Source License
/**
 * Builds the fuzzy text analysis chain: one keyword token, lower-cased,
 * stripped of {@code addressLineStemmingPattern} and
 * {@code nonAlphaCharPattern} matches, encoded with the Cologne phonetic
 * algorithm (original token kept, since inject=true), and finally expanded
 * into n-grams of length 2-6.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.addressLineStemmingPattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    chain = new PhoneticFilter(chain, new ColognePhonetic(), true);
    chain = new NGramTokenFilter(chain, 2, 6);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.WikiURLAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source = new KeywordTokenizer(); TokenStream result = new StandardFilter(source); result = new CharacterFilter(result); result = new ASCIIFoldingFilter(result); result = new LowerCaseFilter(result); // result = new WordDelimiterFilter(result, WordDelimiterFilter.DIGIT, null); return new TokenStreamComponents(source, result); }
From source file:org.apache.jena.query.text.analyzer.ConfigurableAnalyzer.java
License:Apache License
private Tokenizer getTokenizer(String tokenizerName) { switch (tokenizerName) { case "KeywordTokenizer": return new KeywordTokenizer(); case "LetterTokenizer": return new LetterTokenizer(); case "StandardTokenizer": return new StandardTokenizer(); case "WhitespaceTokenizer": return new WhitespaceTokenizer(); default://from w w w. j ava2 s. c o m throw new TextIndexException("Unknown tokenizer : " + tokenizerName); } }
From source file:org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer.java
License:Apache License
/**
 * Builds the simplest keyword chain: the entire field value is emitted as a
 * single token and lower-cased.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final KeywordTokenizer tokenizer = new KeywordTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java
License:Apache License
@Test public void testRSLPRules() throws Exception { Index index = new Index("test"); Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir()).put("index.analysis.filter.myStemmer.type", "br_rslp").build(); AnalysisService analysisService = createAnalysisService(index, settings); TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer"); Tokenizer tokenizer = new KeywordTokenizer(); Map<String, String> words = buildWordList(); Set<String> inputWords = words.keySet(); for (String word : inputWords) { tokenizer.setReader(new StringReader(word)); TokenStream ts = filterFactory.create(tokenizer); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w ww . ja va2 s . co m assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(words.get(word))); ts.close(); } }
From source file:org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java
License:Apache License
/**
 * Wraps each input string in its own keyword tokenizer, runs both through the
 * given collation filter factory, and delegates the comparison to the
 * stream-based {@code assertCollation} overload.
 */
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison)
        throws IOException {
    Tokenizer first = new KeywordTokenizer();
    first.setReader(new StringReader(string1));
    TokenStream stream1 = factory.create(first);

    Tokenizer second = new KeywordTokenizer();
    second.setReader(new StringReader(string2));
    TokenStream stream2 = factory.create(second);

    assertCollation(stream1, stream2, comparison);
}