Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java

License:Open Source License

/**
 * Tokenizes {@code text} with the given {@code analyzer} and returns the
 * resulting terms in stream order.
 *
 * @param text     the raw text to analyze
 * @param analyzer the Lucene analyzer used to produce the token stream
 * @return the analyzed terms, possibly empty, never {@code null}
 */
public List<String> parseTerms(final String text, final Analyzer analyzer) {
    final List<String> terms = new LinkedList<String>();
    // try-with-resources guarantees the TokenStream is closed even if
    // incrementToken() throws; the original leaked the stream on failure.
    try (final TokenStream tokenStream = analyzer.tokenStream(null, text)) {
        tokenStream.reset();
        // Use the concrete CharTermAttribute: its toString() is the
        // documented way to read the current term's text.
        final CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(attribute.toString());
        }
        // end() must be called after the stream is exhausted, per the
        // TokenStream workflow contract.
        tokenStream.end();
    } catch (final IOException e) {
        // Cannot be thrown due to usage of a StringReader
    }
    return terms;
}

From source file:com.rubenlaguna.en4j.searchlucene.AnalyzerUtils.java

License:Open Source License

/**
 * Prints every token of {@code stream} to stdout in the form
 * {@code [term] [term] ...} on a single line.
 *
 * <p>NOTE(review): this method does not call {@code stream.reset()}; it
 * assumes the caller has already positioned the stream. On Lucene 4+ a
 * stream that was never reset fails in {@code incrementToken()} — confirm
 * callers reset the stream first.
 *
 * @param stream the token stream to drain; consumed but not closed here
 * @throws IOException if the underlying stream fails while advancing
 */
public static void displayTokens(TokenStream stream) throws IOException {
    // addAttribute is generic, so the explicit cast in the original was
    // redundant.
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] "); //B
    }
}

From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java

License:Apache License

/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 * (or null if this content should be ignored)
 *///from   ww  w .  j  a  va  2s .c o m
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:com.searchbox.SuggeterDataStructureBuilder.java

License:Apache License

/**
 * Tokenizes {@code fulltext} with the configured analyzer against the
 * first field in {@code fields} and returns the tokens in stream order.
 *
 * @param fulltext the text to tokenize
 * @return the tokens as an array; empty (never {@code null}) on I/O failure
 */
private String[] getTokens(String fulltext) {
    LinkedList<String> tokens = new LinkedList<String>();
    // try-with-resources fixes the original's resource leak: the stream
    // was never closed, and end() was never called after exhaustion.
    try (TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext))) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        // Finalize the stream state per the TokenStream workflow contract.
        tokenStream.end();
    } catch (IOException ex) {
        // Best-effort: log and fall through to return what we collected.
        LOGGER.error("Failure reading tokens from stream", ex);
    }
    return tokens.toArray(new String[0]);
}

From source file:com.searchcode.app.util.CodeAnalyzer.java

License:Open Source License

/**
 * Demonstrates consuming a TokenStream: analyzes a fixed demo sentence
 * with {@link CodeAnalyzer} and prints one token per line.
 */
public static void main(String[] args) throws IOException {
    // Fixed sentence to tokenize.
    final String text = "This is a demo of the TokenStream API";

    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream tokens = analyzer.tokenStream("field", new StringReader(text));

    // CharTermAttribute exposes the text of the token the stream is
    // currently positioned on.
    CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);

    try {
        tokens.reset();

        // Drain the stream, printing each token on its own line.
        while (tokens.incrementToken()) {
            System.out.println(term.toString());
        }

        tokens.end();
    } finally {
        // Always release the stream, even if iteration fails.
        tokens.close();
    }
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

/**
 * Demonstrates TeeSinkTokenFilter: tokenizes a sentence by whitespace,
 * tees the tokens into a color-matching sink, then (1) prints every
 * original token with its position and (2) prints the sink's annotation
 * tokens with the start/length pair decoded from each token's payload.
 *
 * <p>NOTE: the order below matters — the tee (original) stream must be
 * fully consumed first, because that is what populates the sink stream.
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    // Tee the tokenizer's output so a second consumer (the sink) sees it too.
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    // The sink receives only tokens accepted by ColorsSinkFilter; the
    // AnnotatingTokenFilter wraps them as COLOR_ANNOT_TERM annotations.
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    // Positions are accumulated from increments; start at -1 so the first
    // token (increment 1) lands on position 0.
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    // Reusable reader for decoding the two VInts (start, length) that the
    // annotating filter stores in each token's payload.
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Asserts that {@code ts} yields exactly the expected tokens, in order,
 * and nothing more. Positions are accumulated from position increments.
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    // Start at -1 so the first token's increment (usually 1) yields position 0.
    int position = -1;
    for (final TokenInfo expected : infos) {
        assertThat(ts.incrementToken()).isTrue();
        position += posAtt.getPositionIncrement();
        final TokenInfo actual = new TokenInfo(termAtt.toString(), position);
        assertThat(actual).isEqualTo(expected);
    }
    // The stream must be exhausted once all expected tokens are consumed.
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License:Apache License

/**
 * Asserts that {@code ts} yields exactly the expected tokens, in order,
 * and nothing more. Positions are accumulated from position increments;
 * for tokens whose expected {@code len != -1} the annotation length is
 * decoded from the token's payload (a single VInt) and compared too.
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    // Reusable reader for decoding the length VInt from the payload bytes.
    final ByteArrayDataInput in = new ByteArrayDataInput();
    // Start at -1 so the first token's increment (usually 1) yields position 0.
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            // Expected an annotated token: it must carry a payload whose
            // leading VInt is the annotation length.
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            // Plain tokens must not carry a payload.
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    // The stream must be exhausted once all expected tokens are consumed.
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.LemmatizingTokenizerDemo.java

License:Apache License

/**
 * Resets {@code tokenStream} and prints it once per token; printing the
 * stream itself relies on TokenStream.toString(), which reflects all of
 * the stream's current attributes.
 *
 * @param tokenStream the stream to drain; consumed but not closed here
 * @throws IOException if advancing the stream fails
 */
private static void printTokens(TokenStream tokenStream) throws IOException {
    // Position the stream before the first token.
    tokenStream.reset();
    // Print the stream state for every token until it is exhausted.
    while (tokenStream.incrementToken()) {
        System.out.println(tokenStream);
    }
}

From source file:com.shaie.SynonymFilterExample.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    // Tokenize a phrase that overlaps several of the multi-word synonyms
    // registered below.
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    // Map several color phrases to the single synonym "color".
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);

    final TokenStream synonyms = new SynonymGraphFilter(tokenizer, builder.build(), true);

    final CharTermAttribute termAtt = synonyms.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = synonyms.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = synonyms.addAttribute(PositionLengthAttribute.class);

    synonyms.reset();
    // Accumulate positions from increments; -1 so the first token is pos 0.
    int position = -1;
    while (synonyms.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        // posLen shows how many positions a synonym token spans in the graph.
        System.out.println("term=" + termAtt + ", pos=" + position + ", posLen=" + posLengthAtt.getPositionLength());
    }
    synonyms.end();
    synonyms.close();
}