List of usage examples for the org.apache.lucene.analysis.standard.StandardTokenizer no-argument constructor
public StandardTokenizer()
From source file:org.elasticsearch.index.analysis.CustomWordBoundaryStandardTokenizerFactory.java
License:Apache License
@Override public Tokenizer create() { if (version.onOrAfter(Version.LUCENE_5_5_0)) { return new CustomWordBoundaryStandardTokenizer(characterMappings); } else {//from w w w . j a v a 2s. c o m return new StandardTokenizer(); } }
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
public void testLenientParser() throws IOException, ParseException { ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer()); String rules = "&,and\n" + "come,advance,approach\n"; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from w ww. ja va 2 s . c o m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("approach quietly then advance & destroy")); TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false); assertTokenStreamContents(ts, new String[] { "come", "quietly", "then", "come", "destroy" }); }
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "foo,bar,baz"; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from w ww . j av a2 s. com SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
/**
 * Lenient WordNet synonym parsing: the synset containing the unanalyzable
 * token "&amp;" is skipped, while the verb synset still maps advance/approach
 * onto "come".
 */
public void testLenientParser() throws IOException, ParseException {
    ESWordnetSynonymParser parser =
            new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer());

    String rules = "s(100000001,1,'&',a,1,0).\n"
            + "s(100000001,2,'and',a,1,0).\n"
            + "s(100000002,1,'come',v,1,0).\n"
            + "s(100000002,2,'advance',v,1,0).\n"
            + "s(100000002,3,'approach',v,1,0).";
    StringReader reader = new StringReader(rules);
    parser.parse(reader);
    SynonymMap synonyms = parser.build();

    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("approach quietly then advance & destroy"));
    TokenStream stream = new SynonymFilter(tokenizer, synonyms, false);

    // "&" is dropped by the tokenizer; advance/approach are rewritten to "come".
    assertTokenStreamContents(stream, new String[] { "come", "quietly", "then", "come", "destroy" });
}
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n" + "s(100000001,3,'baz',v,1,0)."; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);// w w w . j av a 2 s . co m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.elasticsearch.index.analysis.KeepTypesFilterFactoryTests.java
License:Apache License
/**
 * Verifies the keep_types filter retains only tokens whose type is listed
 * ({@code <NUM>} here), so only "123" survives and keeps its position.
 */
@Test
public void testKeepTypes() throws IOException {
    Settings settings = Settings.settingsBuilder()
            .put("path.home", createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putArray("index.analysis.filter.keep_numbers.types",
                    new String[] { "<NUM>", "<SOMETHINGELSE>" })
            .build();
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep_numbers");
    assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));

    String source = "Hello 123 world";
    String[] expected = new String[] { "123" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));

    // Position increment 2: "Hello" is filtered out but its position is preserved.
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
}
From source file:org.elasticsearch.index.analysis.SnowballAnalyzer.java
License:Apache License
/** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter} */ @Override// www . j a va 2 s. c o m public TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer; if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) { tokenizer = new StandardTokenizer(); } else { tokenizer = new StandardTokenizer40(); } TokenStream result = tokenizer; // remove the possessive 's for english stemmers if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) result = new EnglishPossessiveFilter(result); // Use a special lowercase filter for turkish, the stemmer expects it. if (name.equals("Turkish")) result = new TurkishLowerCaseFilter(result); else result = new LowerCaseFilter(result); if (stopSet != null) result = new StopFilter(result, stopSet); result = new SnowballFilter(result, name); return new TokenStreamComponents(tokenizer, result); }
From source file:org.elasticsearch.index.query.MockRepeatAnalyzer.java
License:Apache License
/** Builds a standard tokenizer whose output is duplicated by {@code MockRepeatFilter}. */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream repeated = new MockRepeatFilter(source);
    return new TokenStreamComponents(source, repeated);
}
From source file:org.lambda3.indra.core.IndraAnalyzer.java
License:Open Source License
/**
 * Creates an analyzer for the given language using the preprocessing options
 * described by {@code metadata}.
 *
 * @param lang     language code; must not be null
 * @param metadata model preprocessing configuration; must not be null
 * @throws IllegalArgumentException if either argument is null
 */
public IndraAnalyzer(String lang, ModelMetadata metadata) {
    if (lang == null || metadata == null) {
        throw new IllegalArgumentException("all parameters are mandatory.");
    }
    logger.debug("Creating analyzer, lang={}, preprocessing={}", lang, metadata);
    tokenizer = new StandardTokenizer();
    tokenStream = createStream(lang, metadata, tokenizer);
}
From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java
License:Open Source License
/**
 * Creates an iterator that tokenizes and preprocesses {@code text} according
 * to the corpus metadata.
 *
 * @param metadata corpus preprocessing configuration; must not be null
 * @param text     input text to iterate over; must not be null
 */
StandardPreProcessorIterator(CorpusMetadata metadata, String text) {
    this.metadata = Objects.requireNonNull(metadata);
    this.tokenizer = new StandardTokenizer();
    this.tokenStream = createStream(metadata, tokenizer);
    setTransformers();
    initialize(Objects.requireNonNull(text));
}