List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer#setReader
public final void setReader(Reader input)
From source file:net.nunoachenriques.vader.text.TokenizerEnglishTest.java
License:Apache License
/**
 * Tokenizes {@code s} with a Lucene {@code StandardTokenizer} and keeps only the
 * tokens that pass a {@code LengthFilter} configured with a minimum length of 2
 * and a maximum of {@code Integer.MAX_VALUE}.
 *
 * <p>An {@link IOException} raised while consuming the token stream is printed and
 * otherwise ignored; whatever tokens were gathered up to that point are returned.
 *
 * @param s text to tokenize
 * @return the collected token strings, in emission order; never {@code null},
 *         possibly empty
 */
private List<String> cleanPunctuationAndSplitWhitespaceLucene(String s) {
    // Created up front so a failure inside the try block still yields a list instead of null.
    final List<String> tokenizedString = new ArrayList<>();
    final StandardTokenizer removePunctuationTokenizer = new StandardTokenizer();
    removePunctuationTokenizer.setReader(new StringReader(s));
    try (TokenStream tokenStream = new LengthFilter(removePunctuationTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        // Best-effort helper: report the failure and fall through with what was collected.
        ioe.printStackTrace();
    }
    return tokenizedString;
}
From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java
License:Apache License
/**
 * Runs the given text through a {@code StandardTokenizer} wrapped in a
 * {@code SelectiveFoldingFilter} and gathers the term text of every token
 * the filter emits.
 *
 * @param inputText reader supplying the text to tokenize
 * @param whitelisted white-list handed to the {@code SelectiveFoldingFilter}
 * @return term text of each emitted token, in emission order
 * @throws IOException from Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    final StandardTokenizer source = new StandardTokenizer();
    source.setReader(inputText);
    final List<String> terms = new ArrayList<>();
    try (SelectiveFoldingFilter filter = new SelectiveFoldingFilter(source, whitelisted)) {
        final CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        filter.reset();
        for (boolean more = filter.incrementToken(); more; more = filter.incrementToken()) {
            terms.add(term.toString());
        }
        filter.end();
    }
    return terms;
}