Usage examples for the no-argument constructor of org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer()
From source file:org.nlp4l.framework.builtin.kea.KEAStandardAnalyzer.java
License:Apache License
/**
 * Builds the KEA analysis chain: standard tokenization, lowercasing, and a
 * KEA-specific stop filter — with an n-token shingle stage inserted when n &gt;= 2.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    final TokenStream lowercased = new LowerCaseFilter(source);
    if (n == 1) {
        // Unigram mode: stop-filter the lowercased stream directly.
        final TokenStream stopped =
                new KEAStopFilter(lowercased, n, stopWords, beginStopWords, endStopWords);
        return new TokenStreamComponents(source, stopped);
    }
    assert n >= 2;
    // N-gram mode: emit shingles of exactly n tokens (no unigrams), then stop-filter.
    final ShingleFilter shingles = new ShingleFilter(lowercased, n, n);
    shingles.setOutputUnigrams(false);
    final TokenStream stopped =
            new KEAStopFilter(shingles, n, stopWords, beginStopWords, endStopWords);
    return new TokenStreamComponents(source, stopped);
}
From source file:org.silverpeas.core.index.indexing.model.WAAnalyzer.java
License:Open Source License
/** * Returns a tokens stream built on top of the given reader. * *///from w w w.ja v a 2s .co m @Override protected TokenStreamComponents createComponents(final String s) { final Tokenizer source = new StandardTokenizer(); // remove 's and . from token TokenStream result = new StandardFilter(source); result = new LowerCaseFilter(result); // remove some unexplicit terms result = new StopFilter(result, FrenchAnalyzer.getDefaultStopSet()); // remove [cdjlmnst-qu]' from token result = new ElisionFilter(result, FrenchAnalyzer.DEFAULT_ARTICLES); if (snowballUsed) { // Important! Strings given to Snowball filter must contains accents // so accents must be removed after stemmer have done the job // ignoring singular/plural, male/female and conjugated forms result = new SnowballFilter(result, stemmer); } // remove accents result = new ASCIIFoldingFilter(result); return new TokenStreamComponents(source, result); }
From source file:org.tallison.lucene.search.concordance.ConcordanceTestBase.java
License:Apache License
/**
 * Creates a test Analyzer whose chain is StandardTokenizer followed by a
 * CJKBigramFilter (flags = 15, i.e. all CJK scripts), with fixed position
 * and offset gaps of 10 between field values.
 */
public static Analyzer getCJKBigramAnalyzer(final boolean outputUnigrams) {
    return new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer src = new StandardTokenizer();
            final TokenFilter bigrams = new CJKBigramFilter(src, 15, outputUnigrams);
            return new TokenStreamComponents(src, bigrams);
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return 10;
        }

        @Override
        public int getOffsetGap(String fieldName) {
            return 10;
        }
    };
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/** * ?StandardTokenizer/* w w w . ja v a 2s . c o m*/ */ public void testST() { Tokenizer tokenizer = new StandardTokenizer(); try { tokenizer.setReader(new StringReader( "?????IKAnalyer can analysis english text too")); } catch (IOException e) { throw new RuntimeException(); } TokenStreamComponents tsc = new TokenStreamComponents(tokenizer); TokenStream ts = tsc.getTokenStream(); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); TypeAttribute type = ts.addAttribute(TypeAttribute.class); try { ts.reset(); while (ts.incrementToken()) { System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->" + type.type()); } ts.end(); } catch (IOException e) { throw new RuntimeException(); } }
From source file:ri.AnalyzerNuevo.java
@Override protected TokenStreamComponents createComponents(String string) { //To change body of generated methods, choose Tools | Templates. final Tokenizer source = new StandardTokenizer(); Reader reader = new StringReader(string); source.setReader(reader);/*w w w . ja va2 s. c o m*/ //SynonymMap.Builder builder = new SynonymMap.Builder(true); //builder.add(new CharsRef("text"), new CharsRef("documento"), true); //SynonymMap synonymMap; TokenStream pipeline = source; pipeline = new StandardFilter(pipeline); pipeline = new EnglishPossessiveFilter(pipeline); /*try { synonymMap = builder.build(); pipeline = new SynonymFilter(pipeline,synonymMap,true); } catch (IOException ex) { Logger.getLogger(AnalyzerNuevo.class.getName()).log(Level.SEVERE, null, ex); }*/ pipeline = new ASCIIFoldingFilter(pipeline); pipeline = new LowerCaseFilter(pipeline); pipeline = new StopFilter(pipeline, new CharArraySet(stopwords, true)); pipeline = new PorterStemFilter(pipeline); return new TokenStreamComponents(source, pipeline); }