Example usage for org.apache.lucene.analysis.standard.StandardTokenizer#setReader

List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer#setReader

Introduction

This page collects example usages of the setReader method of org.apache.lucene.analysis.standard.StandardTokenizer.

Prototype

public final void setReader(Reader input) 

Source Link

Document

Expert: Set a new reader on the Tokenizer.

Usage

From source file:net.nunoachenriques.vader.text.TokenizerEnglishTest.java

License:Apache License

/**
 * Tokenizes the given text with a Lucene {@code StandardTokenizer} wrapped in a
 * {@code LengthFilter} configured with bounds 2 and {@code Integer.MAX_VALUE},
 * and returns the resulting terms in stream order.
 *
 * <p>This is best-effort: if the token stream throws an {@link IOException}, the
 * stack trace is printed and the terms collected so far (possibly none) are
 * returned. The result is never {@code null}.
 *
 * @param s the text to tokenize
 * @return the terms produced by the token stream; empty if none were produced
 */
private List<String> cleanPunctuationAndSplitWhitespaceLucene(String s) {
    // Created before the stream is touched so that a failure in reset() still
    // yields an (empty) list rather than null.
    final List<String> tokenizedString = new ArrayList<>();
    final StandardTokenizer removePunctuationTokenizer = new StandardTokenizer();
    removePunctuationTokenizer.setReader(new StringReader(s));
    try (TokenStream tokenStream = new LengthFilter(removePunctuationTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        // Lucene consumer workflow: reset(), incrementToken() until false, end(), close().
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        // Deliberately not rethrown: callers receive whatever was tokenized so far.
        ioe.printStackTrace();
    }
    return tokenizedString;
}

From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java

License:Apache License

/**
 * Feeds the given reader through a {@code StandardTokenizer} wrapped in a
 * {@code SelectiveFoldingFilter} and gathers every emitted term as a String.
 *
 * @param inputText reader supplying the text to tokenize
 * @param whitelisted white-list handed to the {@code SelectiveFoldingFilter}
 * @return the terms emitted by the filter, in stream order
 * @throws IOException from Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    final StandardTokenizer source = new StandardTokenizer();
    source.setReader(inputText);

    final List<String> terms = new ArrayList<>();
    try (SelectiveFoldingFilter filter = new SelectiveFoldingFilter(source, whitelisted)) {
        final CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        filter.reset();
        // Pull tokens until the stream reports exhaustion.
        for (boolean more = filter.incrementToken(); more; more = filter.incrementToken()) {
            terms.add(term.toString());
        }
        filter.end();
    }
    return terms;
}