Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java

License:Open Source License

/**
 * Tokenizes {@code text} with the given {@code analyzer} and returns the
 * resulting terms in stream order.
 *
 * @param text     the raw text to analyze
 * @param analyzer the Lucene analyzer used to produce the token stream
 * @return the analyzed terms, possibly empty, never {@code null}
 */
public List<String> parseTerms(final String text, final Analyzer analyzer) {
    final List<String> terms = new LinkedList<String>();
    // try-with-resources guarantees the TokenStream is closed even if
    // incrementToken() throws; the original leaked the stream on failure.
    try (final TokenStream tokenStream = analyzer.tokenStream(null, text)) {
        tokenStream.reset();
        // Use the concrete CharTermAttribute: its toString() is the
        // documented way to read the current term's text.
        final CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(attribute.toString());
        }
        // end() must be called after the stream is exhausted, per the
        // TokenStream workflow contract.
        tokenStream.end();
    } catch (final IOException e) {
        // Cannot be thrown due to usage of a StringReader
    }
    return terms;
}

From source file:com.rubenlaguna.en4j.searchlucene.AnalyzerUtils.java

License:Open Source License

/**
 * Prints every token of {@code stream} to stdout in the form
 * {@code [term] [term] ...} on a single line.
 *
 * <p>NOTE(review): this method does not call {@code stream.reset()}; it
 * assumes the caller has already positioned the stream. On Lucene 4+ a
 * stream that was never reset fails in {@code incrementToken()} — confirm
 * callers reset the stream first.
 *
 * @param stream the token stream to drain; consumed but not closed here
 * @throws IOException if the underlying stream fails while advancing
 */
public static void displayTokens(TokenStream stream) throws IOException {
    // addAttribute is generic, so the explicit cast in the original was
    // redundant.
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] "); //B
    }
}

From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java

License:Apache License

/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 * (or null if this content should be ignored)
 *///from   ww  w .  j  a  va  2s .c o m
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:com.searchbox.SuggeterDataStructureBuilder.java

License:Apache License

/**
 * Tokenizes {@code fulltext} with the configured analyzer against the
 * first field in {@code fields} and returns the tokens in stream order.
 *
 * @param fulltext the text to tokenize
 * @return the tokens as an array; empty (never {@code null}) on I/O failure
 */
private String[] getTokens(String fulltext) {
    LinkedList<String> tokens = new LinkedList<String>();
    // try-with-resources fixes the original's resource leak: the stream
    // was never closed, and end() was never called after exhaustion.
    try (TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext))) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        // Finalize the stream state per the TokenStream workflow contract.
        tokenStream.end();
    } catch (IOException ex) {
        // Best-effort: log and fall through to return what we collected.
        LOGGER.error("Failure reading tokens from stream", ex);
    }
    return tokens.toArray(new String[0]);
}

From source file:com.searchcode.app.util.CodeAnalyzer.java

License:Open Source License

/**
 * Demonstrates consuming a TokenStream: analyzes a fixed demo sentence
 * with {@link CodeAnalyzer} and prints one token per line.
 */
public static void main(String[] args) throws IOException {
    // Fixed sentence to tokenize.
    final String text = "This is a demo of the TokenStream API";

    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream tokens = analyzer.tokenStream("field", new StringReader(text));

    // CharTermAttribute exposes the text of the token the stream is
    // currently positioned on.
    CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);

    try {
        tokens.reset();

        // Drain the stream, printing each token on its own line.
        while (tokens.incrementToken()) {
            System.out.println(term.toString());
        }

        tokens.end();
    } finally {
        // Always release the stream, even if iteration fails.
        tokens.close();
    }
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

/**
 * Demonstrates TeeSinkTokenFilter: tokenizes a sentence by whitespace,
 * tees the tokens into a color-matching sink, then (1) prints every
 * original token with its position and (2) prints the sink's annotation
 * tokens with the start/length pair decoded from each token's payload.
 *
 * <p>NOTE: the order below matters — the tee (original) stream must be
 * fully consumed first, because that is what populates the sink stream.
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    // Tee the tokenizer's output so a second consumer (the sink) sees it too.
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    // The sink receives only tokens accepted by ColorsSinkFilter; the
    // AnnotatingTokenFilter wraps them as COLOR_ANNOT_TERM annotations.
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    // Positions are accumulated from increments; start at -1 so the first
    // token (increment 1) lands on position 0.
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    // Reusable reader for decoding the two VInts (start, length) that the
    // annotating filter stores in each token's payload.
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Asserts that {@code ts} yields exactly the expected tokens, in order,
 * and nothing more. Positions are accumulated from position increments.
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    // Start at -1 so the first token's increment (usually 1) yields position 0.
    int position = -1;
    for (final TokenInfo expected : infos) {
        assertThat(ts.incrementToken()).isTrue();
        position += posAtt.getPositionIncrement();
        final TokenInfo actual = new TokenInfo(termAtt.toString(), position);
        assertThat(actual).isEqualTo(expected);
    }
    // The stream must be exhausted once all expected tokens are consumed.
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License:Apache License

/**
 * Asserts that {@code ts} yields exactly the expected tokens, in order,
 * and nothing more. Positions are accumulated from position increments;
 * for tokens whose expected {@code len != -1} the annotation length is
 * decoded from the token's payload (a single VInt) and compared too.
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    // Reusable reader for decoding the length VInt from the payload bytes.
    final ByteArrayDataInput in = new ByteArrayDataInput();
    // Start at -1 so the first token's increment (usually 1) yields position 0.
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            // Expected an annotated token: it must carry a payload whose
            // leading VInt is the annotation length.
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            // Plain tokens must not carry a payload.
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    // The stream must be exhausted once all expected tokens are consumed.
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.LemmatizingTokenizerDemo.java

License:Apache License

/**
 * Resets {@code tokenStream} and prints it once per token; printing the
 * stream itself relies on TokenStream.toString(), which reflects all of
 * the stream's current attributes.
 *
 * @param tokenStream the stream to drain; consumed but not closed here
 * @throws IOException if advancing the stream fails
 */
private static void printTokens(TokenStream tokenStream) throws IOException {
    // Position the stream before the first token.
    tokenStream.reset();
    // Print the stream state for every token until it is exhausted.
    while (tokenStream.incrementToken()) {
        System.out.println(tokenStream);
    }
}

From source file:com.shaie.SynonymFilterExample.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    // Tokenize a phrase that overlaps several of the multi-word synonyms
    // registered below.
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    // Map several color phrases to the single synonym "color".
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);

    final TokenStream synonyms = new SynonymGraphFilter(tokenizer, builder.build(), true);

    final CharTermAttribute termAtt = synonyms.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = synonyms.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = synonyms.addAttribute(PositionLengthAttribute.class);

    synonyms.reset();
    // Accumulate positions from increments; -1 so the first token is pos 0.
    int position = -1;
    while (synonyms.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        // posLen shows how many positions a synonym token spans in the graph.
        System.out.println("term=" + termAtt + ", pos=" + position + ", posLen=" + posLengthAtt.getPositionLength());
    }
    synonyms.end();
    synonyms.close();
}