Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer.

Prototype

public StandardTokenizer() 

Source Link

Document

Creates a new instance of org.apache.lucene.analysis.standard.StandardTokenizer.

Usage

From source file:net.nunoachenriques.vader.text.TokenizerEnglishTest.java

License: Apache License

/**
 * Tokenizes the given string with Lucene's StandardTokenizer, dropping
 * punctuation (the tokenizer does this) and any token shorter than two
 * characters (via LengthFilter).
 *
 * @param s the text to tokenize
 * @return the resulting tokens; empty (never null) if tokenization fails
 */
private List<String> cleanPunctuationAndSplitWhitespaceLucene(String s) {
    StringReader reader = new StringReader(s);
    StandardTokenizer removePunctuationTokenizer = new StandardTokenizer();
    removePunctuationTokenizer.setReader(reader);
    // Initialized up front so the method never returns null, even when
    // reset()/incrementToken() throws before any token is collected.
    List<String> tokenizedString = new ArrayList<>();
    // min length 2 drops single-character tokens left over after tokenization.
    try (TokenStream tokenStream = new LengthFilter(removePunctuationTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        // Best-effort in the original: log and fall through with what we have.
        ioe.printStackTrace();
    }
    return tokenizedString;
}

From source file:nl.knaw.huygens.timbuctoo.lucene.accentanalyzer.MySearchAnalyzer.java

License: Apache License

/**
 * Builds the analysis chain for a field: a StandardTokenizer capped at
 * {@code maxTokenLength}, followed by a StandardFilter.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(src);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) {
            // Re-apply the limit on reuse, in case maxTokenLength changed
            // on the analyzer after these components were built.
            src.setMaxTokenLength(MySearchAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}

From source file:org.apache.jena.query.text.analyzer.ConfigurableAnalyzer.java

License: Apache License

/**
 * Maps a configured tokenizer name to a fresh tokenizer instance.
 *
 * @param tokenizerName simple class name, e.g. {@code "StandardTokenizer"}
 * @return a new Tokenizer of the requested kind
 * @throws TextIndexException if the name is not one of the supported tokenizers
 */
private Tokenizer getTokenizer(String tokenizerName) {
    switch (tokenizerName) {
    case "KeywordTokenizer":
        return new KeywordTokenizer();
    case "LetterTokenizer":
        return new LetterTokenizer();
    case "StandardTokenizer":
        return new StandardTokenizer();
    case "WhitespaceTokenizer":
        return new WhitespaceTokenizer();
    default:
        throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
    }
}

From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java

License: Apache License

/**
 * Runs the input through a StandardTokenizer and a SelectiveFoldingFilter,
 * collecting every emitted term as a String.
 *
 * @param inputText the text to tokenize
 * @param whitelisted white-list of characters exempt from folding
 * @return list of CharTermAttribute values converted to Strings
 * @throws IOException from Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(inputText);

    // Closing the filter also closes the wrapped tokenizer.
    try (SelectiveFoldingFilter selectiveFoldingFilter = new SelectiveFoldingFilter(tokenizer, whitelisted)) {
        CharTermAttribute termAttrib = selectiveFoldingFilter.getAttribute(CharTermAttribute.class);
        selectiveFoldingFilter.reset();
        List<String> tokens = new ArrayList<>();
        while (selectiveFoldingFilter.incrementToken()) {
            tokens.add(termAttrib.toString());
        }
        selectiveFoldingFilter.end();
        return tokens;
    }
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License: Apache License

/**
 * Creates a token stream over {@code content} using the requested tokenizer,
 * stores it in the {@code tokenStream} field, and returns it.
 *
 * @param content text to tokenize
 * @param tokenizerType CLASSIC for ClassicTokenizer; STANDARD (the default)
 *        for StandardTokenizer
 * @return the tokenizer, positioned over the content
 */
private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType) {
    Tokenizer tokenizer;
    switch (tokenizerType) {
    case CLASSIC:
        tokenizer = new ClassicTokenizer();
        break;

    case STANDARD:
    default:
        tokenizer = new StandardTokenizer();
    }

    tokenizer.setReader(new StringReader(content));

    tokenStream = tokenizer;

    return tokenStream;
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License: Apache License

/**
 * Builds a shingle (word n-gram) token stream over the content: standard
 * tokenization, lower-casing, stemming (per {@code stemFilterType}), then
 * shingles of size [mingram, maxgram].
 *
 * @param content text to tokenize
 * @param mingram minimum shingle size
 * @param maxgram maximum shingle size
 * @return the shingle-producing token stream (also stored in the field)
 */
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(content));
    tokenStream = new LowerCaseFilter(tokenizer);
    tokenStream = applyStemmer(stemFilterType);
    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
    // Emit only the shingles themselves, not the underlying single tokens.
    shingleFilter.setOutputUnigrams(false);
    tokenStream = shingleFilter; // ShingleFilter is a TokenStream; no cast needed
    return tokenStream;
}

From source file:org.apache.vxquery.runtime.functions.index.CaseSensitiveAnalyzer.java

License: Apache License

/**
 * Builds the analysis chain: a version-appropriate StandardTokenizer capped
 * at {@code maxTokenLength}, then a StandardFilter and a StopFilter.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src;
    // Lucene 4.7 changed StandardTokenizer's behavior; older indexes need
    // the back-compat StandardTokenizer40.
    if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
        StandardTokenizer t = new StandardTokenizer();
        t.setMaxTokenLength(maxTokenLength);
        src = t;
    } else {
        StandardTokenizer40 t = new StandardTokenizer40();
        t.setMaxTokenLength(maxTokenLength);
        src = t;
    }
    TokenStream tok = new StandardFilter(src);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) {
            // Re-apply the limit on reuse, in case maxTokenLength changed
            // on the analyzer after these components were built.
            int m = CaseSensitiveAnalyzer.this.maxTokenLength;
            if (src instanceof StandardTokenizer) {
                ((StandardTokenizer) src).setMaxTokenLength(m);
            } else {
                ((StandardTokenizer40) src).setMaxTokenLength(m);
            }
            super.setReader(reader);
        }
    };
}

From source file:org.codelibs.elasticsearch.index.analysis.FingerprintAnalyzer.java

License: Apache License

/**
 * Fingerprint analysis chain: standard tokenization, lower-casing, ASCII
 * folding (without preserving originals), stop-word removal, then
 * fingerprint concatenation bounded by {@code maxOutputSize} and joined
 * with {@code separator}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = tokenizer;
    stream = new LowerCaseFilter(stream);
    stream = new ASCIIFoldingFilter(stream, false);
    stream = new StopFilter(stream, stopWords);
    stream = new FingerprintFilter(stream, maxOutputSize, separator);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java

License: Apache License

/**
 * Constructs a {@link StandardTokenizer} filtered by a lower-case filter
 * (Turkish-aware when the language requires it), an optional
 * {@link StopFilter}, and a {@link SnowballFilter} for the configured
 * language {@code name}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for english stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) {
        result = new EnglishPossessiveFilter(result);
    }
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (name.equals("Turkish")) {
        result = new TurkishLowerCaseFilter(result);
    } else {
        result = new LowerCaseFilter(result);
    }
    if (stopSet != null) {
        result = new StopFilter(result, stopSet);
    }
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}

From source file:org.codelibs.elasticsearch.index.analysis.StandardHtmlStripAnalyzer.java

License: Apache License

/**
 * Builds the analysis chain: StandardTokenizer, StandardFilter,
 * lower-casing, and — only when a stop-word set is configured — a
 * StopFilter.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream tok = new StandardFilter(src);
    tok = new LowerCaseFilter(tok);
    if (!stopwords.isEmpty()) {
        tok = new StopFilter(tok, stopwords);
    }
    return new TokenStreamComponents(src, tok);
}