List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer()
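All of the examples below follow the same TokenStream contract: set a reader, add an attribute, then reset(), incrementToken() in a loop, end(), and close(). A minimal self-contained sketch of that contract (class and method names here are illustrative, not from any source file below):

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerContract {
    public static List<String> tokenize(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (StandardTokenizer tokenizer = new StandardTokenizer()) {
            tokenizer.setReader(new StringReader(text));  // input must be set before reset()
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();                            // mandatory before incrementToken()
            while (tokenizer.incrementToken()) {
                tokens.add(term.toString());
            }
            tokenizer.end();                              // records final offset state
        }                                                 // close() releases the reader
        return tokens;
    }
}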
From source file:net.nunoachenriques.vader.text.TokenizerEnglishTest.java
License:Apache License
private List<String> cleanPunctuationAndSplitWhitespaceLucene(String s) {
    StringReader reader = new StringReader(s);
    StandardTokenizer removePunctuationTokenizer = new StandardTokenizer();
    removePunctuationTokenizer.setReader(reader);
    ArrayList<String> tokenizedString = null;
    // StandardTokenizer drops punctuation; LengthFilter keeps only tokens of length >= 2.
    try (TokenStream tokenStream = new LengthFilter(removePunctuationTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        tokenizedString = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
    return tokenizedString;
}
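A hypothetical call to the helper above, assuming StandardTokenizer's default word splitting; the single-letter token is removed by the LengthFilter minimum of 2:

// Illustrative input and output, not from the test itself:
List<String> tokens = cleanPunctuationAndSplitWhitespaceLucene("VADER is a great tool!");
// tokens -> [VADER, is, great, tool]   ("a" filtered out by length, punctuation dropped)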
From source file:nl.knaw.huygens.timbuctoo.lucene.accentanalyzer.MySearchAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(src);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) {
            src.setMaxTokenLength(MySearchAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
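Analyzers built this way are normally consumed through Analyzer.tokenStream(...), which invokes createComponents lazily and reuses the components per thread. A sketch, assuming MySearchAnalyzer has a usable no-arg constructor (the real constructor is not shown above):

static List<String> analyze(String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    // The no-arg constructor is an assumption; the real one may take maxTokenLength.
    try (Analyzer analyzer = new MySearchAnalyzer();
            TokenStream ts = analyzer.tokenStream("body", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(term.toString());
        }
        ts.end();
    }
    return tokens;
}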
From source file:org.apache.jena.query.text.analyzer.ConfigurableAnalyzer.java
License:Apache License
private Tokenizer getTokenizer(String tokenizerName) {
    switch (tokenizerName) {
    case "KeywordTokenizer":
        return new KeywordTokenizer();
    case "LetterTokenizer":
        return new LetterTokenizer();
    case "StandardTokenizer":
        return new StandardTokenizer();
    case "WhitespaceTokenizer":
        return new WhitespaceTokenizer();
    default:
        throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
    }
}
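The switch matches names case-sensitively, so configuration values must use the exact class names. Hypothetical calls to the private helper:

Tokenizer ok = getTokenizer("StandardTokenizer");    // returns a new StandardTokenizer
getTokenizer("standardtokenizer");                   // throws TextIndexException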
From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java
License:Apache License
/**
 * Returns the list of CharTermAttribute values converted to a list of Strings.
 *
 * @param inputText   the text to tokenize
 * @param whitelisted the white-list of characters exempt from folding
 * @return list of CharTermAttribute values converted to a list of Strings
 * @throws IOException from the Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(inputText);
    try (SelectiveFoldingFilter selectiveFoldingFilter = new SelectiveFoldingFilter(tokenizer, whitelisted)) {
        CharTermAttribute termAttrib = selectiveFoldingFilter.getAttribute(CharTermAttribute.class);
        selectiveFoldingFilter.reset();
        List<String> tokens = new ArrayList<>();
        while (selectiveFoldingFilter.incrementToken()) {
            tokens.add(termAttrib.toString());
        }
        selectiveFoldingFilter.end();
        return tokens;
    }
}
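A hypothetical call to collectTokens, assuming SelectiveFoldingFilter folds accented characters to ASCII except those on the white-list (the behavior this test exercises); the input text and expected output are illustrative:

CharArraySet whiteList = new CharArraySet(Arrays.asList("ç"), false);
List<String> tokens = collectTokens(new StringReader("ação e reação"), whiteList);
// tokens -> [açao, e, reaçao]   ('ã' folded to 'a', white-listed 'ç' preserved)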
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType) {
    Tokenizer tokenizer = null;
    switch (tokenizerType) {
    case CLASSIC:
        tokenizer = new ClassicTokenizer();
        break;
    case STANDARD:
    default:
        tokenizer = new StandardTokenizer();
    }
    tokenizer.setReader(new StringReader(content));
    tokenStream = tokenizer;
    return tokenStream;
}
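The practical difference between the two cases: ClassicTokenizer recognizes e-mail addresses and keeps them whole, while the UAX#29-based StandardTokenizer splits at '@'. A standalone comparison sketch (the sample text is illustrative):

for (Tokenizer t : new Tokenizer[] { new ClassicTokenizer(), new StandardTokenizer() }) {
    t.setReader(new StringReader("send to dev@example.com"));
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    StringBuilder out = new StringBuilder();
    while (t.incrementToken()) {
        out.append('[').append(term).append(']');
    }
    t.end();
    t.close();
    System.out.println(out);
}
// ClassicTokenizer:  [send][to][dev@example.com]
// StandardTokenizer: [send][to][dev][example.com]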
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(content));
    tokenStream = new LowerCaseFilter(tokenizer);
    // applyStemmer presumably wraps the tokenStream field with the configured stem filter.
    tokenStream = applyStemmer(stemFilterType);
    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
    shingleFilter.setOutputUnigrams(false);
    tokenStream = shingleFilter;
    return tokenStream;
}
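With setOutputUnigrams(false), only the shingles are emitted. A hypothetical trace of the method above, assuming the configured stemmer leaves these words unchanged:

// createNGramTokenStream("The quick brown fox", 2, 2) would emit, after lower-casing:
//   "the quick", "quick brown", "brown fox"      (no unigrams)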
From source file:org.apache.vxquery.runtime.functions.index.CaseSensitiveAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src;
    if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
        StandardTokenizer t = new StandardTokenizer();
        t.setMaxTokenLength(maxTokenLength);
        src = t;
    } else {
        StandardTokenizer40 t = new StandardTokenizer40();
        t.setMaxTokenLength(maxTokenLength);
        src = t;
    }
    TokenStream tok = new StandardFilter(src);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) {
            int m = CaseSensitiveAnalyzer.this.maxTokenLength;
            if (src instanceof StandardTokenizer) {
                ((StandardTokenizer) src).setMaxTokenLength(m);
            } else {
                ((StandardTokenizer40) src).setMaxTokenLength(m);
            }
            super.setReader(reader);
        }
    };
}
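A short note on the version gate, summarized as a comment (the intent is inferred from the class, not stated in the source):

// getVersion() >= LUCENE_4_7_0 -> StandardTokenizer   (current UAX#29 grammar)
// getVersion() <  LUCENE_4_7_0 -> StandardTokenizer40 (frozen Lucene 4.x grammar)
// Keeping the old tokenizer preserves term boundaries for indexes built with 4.x.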
From source file:org.codelibs.elasticsearch.index.analysis.FingerprintAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = tokenizer;
    stream = new LowerCaseFilter(stream);
    stream = new ASCIIFoldingFilter(stream, false);
    stream = new StopFilter(stream, stopWords);
    stream = new FingerprintFilter(stream, maxOutputSize, separator);
    return new TokenStreamComponents(tokenizer, stream);
}
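A standalone sketch of what the fingerprint stage does, using Lucene's FingerprintFilter directly; the sample text, size limit, and space separator are illustrative:

Tokenizer t = new StandardTokenizer();
t.setReader(new StringReader("New York, new york!"));
TokenStream ts = new LowerCaseFilter(t);
ts = new FingerprintFilter(ts, 255, ' ');
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term);   // one token: "new york" (sorted, de-duplicated, concatenated)
}
ts.end();
ts.close();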
From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java
License:Apache License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // Remove the possessive 's for English stemmers.
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) {
        result = new EnglishPossessiveFilter(result);
    }
    // Use a special lowercase filter for Turkish; the stemmer expects it.
    if (name.equals("Turkish")) {
        result = new TurkishLowerCaseFilter(result);
    } else {
        result = new LowerCaseFilter(result);
    }
    if (stopSet != null) {
        result = new StopFilter(result, stopSet);
    }
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
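The English branch in action, as a standalone sketch (the sample text is illustrative; the stems follow the Snowball English algorithm):

Tokenizer t = new StandardTokenizer();
t.setReader(new StringReader("Plato's arguments"));
TokenStream ts = new EnglishPossessiveFilter(t);
ts = new LowerCaseFilter(ts);
ts = new SnowballFilter(ts, "English");
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term);   // "plato", "argument"
}
ts.end();
ts.close();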
From source file:org.codelibs.elasticsearch.index.analysis.StandardHtmlStripAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream tok = new StandardFilter(src);
    tok = new LowerCaseFilter(tok);
    if (!stopwords.isEmpty()) {
        tok = new StopFilter(tok, stopwords);
    }
    return new TokenStreamComponents(src, tok);
}
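Despite the class name, nothing in this createComponents strips HTML; analyzers of this shape typically do that in an initReader override with HTMLStripCharFilter. A sketch of what such an override could look like; this is an assumption, not code from the source file:

// Assumed companion override; not part of the snippet above.
@Override
protected Reader initReader(String fieldName, Reader reader) {
    // Strip HTML/XML markup before the tokenizer sees the text.
    return new HTMLStripCharFilter(reader);
}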