Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream reset.

Prototype

public void reset() throws IOException

Source Link

Document

This method is called by a consumer before it begins consumption using #incrementToken() .

Usage

From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java

License:Open Source License

public List<String> parseTerms(final String text, final Analyzer analyzer) {
    final List<String> terms = new LinkedList<String>();
    try {// ww w . ja v  a2  s .c  om
        final TokenStream tokenStream = analyzer.tokenStream(null, text);
        tokenStream.reset();
        final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            final String term = attribute.toString();
            terms.add(term);
        }
        tokenStream.end();
        tokenStream.close();
    } catch (final IOException e) {
        // Cannot be thrown due to usage of a StringReader
    }
    return terms;
}

From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java

License:Apache License

/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 * (or null if this content should be ignored)
 *//*from w w  w.  ja  va2 s .c  om*/
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:com.searchbox.SuggeterDataStructureBuilder.java

License:Apache License

private String[] getTokens(String fulltext) {
    LinkedList<String> tokens = new LinkedList<String>();
    try {// w  w  w  .  j av  a  2s  . c  om
        TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext));
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        while (tokenStream.incrementToken()) {
            String token = charTermAttribute.toString();
            tokens.add(token);
        }

    } catch (IOException ex) {
        LOGGER.error("Failure reading tokens from stream", ex);
    }
    return tokens.toArray(new String[0]);
}

From source file:com.searchcode.app.util.CodeAnalyzer.java

License:Open Source License

public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the TokenStream API";

    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    try {//ww w .  ja  v  a  2s. c  o  m
        stream.reset();

        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }

        stream.end();
    } finally {
        stream.close();
    }
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);//ww  w  .j  a v a2s .c  o m

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        assertThat(new TokenInfo(term.toString(), pos)).isEqualTo(info);
    }/*from   w  w w.j  ava  2 s.co m*/
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License:Apache License

private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);// w  w w .ja va2 s  . com
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}

From source file:com.shaie.LemmatizingTokenizerDemo.java

License:Apache License

private static void printTokens(TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(tokenStream);
    }//  ww  w  .j a  va 2  s .  com
}

From source file:com.shaie.SynonymFilterExample.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymGraphFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }/*  w w w  .  ja  va2 s.  c om*/
    ts.end();
    ts.close();
}

From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java

License:Open Source License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;//w w  w  .j  a  v  a2  s.com
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }

    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals("i=" + i, expectedImages[i], termAtt.toString());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }

        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }

    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}