List of usage examples for org.apache.lucene.analysis TokenStream end
Method signature: public void end() throws IOException
Called by the consumer after the last token has been produced, i.e. after
incrementToken() has returned false (using the new TokenStream API).
API). From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java
License:Open Source License
public List<String> parseTerms(final String text, final Analyzer analyzer) { final List<String> terms = new LinkedList<String>(); try {// ww w. j av a 2s . com final TokenStream tokenStream = analyzer.tokenStream(null, text); tokenStream.reset(); final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { final String term = attribute.toString(); terms.add(term); } tokenStream.end(); tokenStream.close(); } catch (final IOException e) { // Cannot be thrown due to usage of a StringReader } return terms; }
From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java
License:Apache License
/** * @param contentText input text to be parsed into terms * @return salient terms in order of appearance * (or null if this content should be ignored) *//*from w w w . j a va2s . com*/ public List<String> getTermList(String contentText) { init(); List<String> result = new ArrayList<String>(contentText.length() / 10); try { TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText)); CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { if (termAtt.length() > 0) { String term = termAtt.toString(); // Here we skip runs of position increment markers created // by the ShingleFilter for stop words because they skew // the clustering/liblinear analysis. if (!term.matches("(_ )*_")) { result.add(term); } } } stream.end(); stream.close(); } catch (IOException e) { throw new RuntimeException("Impossible error", e); } return result; }
From source file:com.searchcode.app.util.CodeAnalyzer.java
License:Open Source License
public static void main(String[] args) throws IOException { // text to tokenize final String text = "This is a demo of the TokenStream API"; CodeAnalyzer analyzer = new CodeAnalyzer(); TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); // get the CharTermAttribute from the TokenStream CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {// w w w. ja v a2 s .c o m stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println(termAtt.toString()); } stream.end(); } finally { stream.close(); } }
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
/**
 * Demo entry point: tokenizes a fixed sentence on whitespace, tees the token
 * stream into a secondary "colors" sink, then prints first the text tokens
 * (with positions) and afterwards the color annotation tokens (with the
 * start/length pairs decoded from their payloads).
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    // TeeSinkTokenFilter lets a single pass over the tokenizer also feed a
    // secondary sink stream; here the sink collects color-matching tokens
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);
    System.out.println("Text tokens:\n");
    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    // positions start at -1 so the first increment lands on position 0
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();
    System.out.println("\nAnnotation tokens:\n");
    // now consume the color annotation tokens from the colors stream;
    // the tee stream must be fully consumed first (done above) or the sink is empty
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        // each payload encodes the annotation's start and length as two VInts
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();
    teeSink.close();
    tokenizer.close();
}
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Tokenizer tok = new WhitespaceTokenizer(); tok.setReader(new StringReader("dark sea green sea green")); final SynonymMap.Builder builder = new SynonymMap.Builder(true); addSynonym("dark sea green", "color", builder); addSynonym("green", "color", builder); addSynonym("dark sea", "color", builder); addSynonym("sea green", "color", builder); final SynonymMap synMap = builder.build(); final TokenStream ts = new SynonymGraphFilter(tok, synMap, true); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); ts.reset();// www.j a va 2 s .com int pos = -1; while (ts.incrementToken()) { pos += posIncrAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength()); } ts.end(); ts.close(); }
From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java
License:Open Source License
/**
 * Feeds {@code input} through the given tokenizer wrapped in a
 * {@code URINormalisationFilter} and asserts that the produced terms — and,
 * when {@code expectedTypes} is non-null, their token types — match the
 * expected values, followed by end-of-stream.
 *
 * @param t              the tokenizer under test; must expose a CharTermAttribute
 * @param input          the raw text to tokenize
 * @param expectedImages the expected term texts, in order
 * @param expectedTypes  the expected token types, or null to skip type checks
 */
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes) throws Exception {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }
    t.setReader(new StringReader(input));
    t.reset();
    // NOTE(review): reset() is called on the inner tokenizer and the filter is
    // constructed afterwards, so the filter itself is never reset — this works
    // only if URINormalisationFilter carries no per-stream state needing reset;
    // confirm against the filter implementation.
    final TokenStream filter = new URINormalisationFilter(t);
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
    }
    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
/**
 * Runs {@code input} through the analyzer and asserts that every produced
 * token matches the expected term text and, for each non-null expectation
 * array, the expected type, position increment, node path, and position.
 * Finally asserts end-of-stream and releases the token stream.
 *
 * @param a                the analyzer under test
 * @param input            the raw text to analyze
 * @param expectedImages   the expected term texts, in order
 * @param expectedTypes    expected token types, or null to skip
 * @param expectedPosIncrs expected position increments, or null to skip
 * @param expectedNode     expected node paths (SIREn NodeAttribute), or null to skip
 * @param expectedPos      expected positions (SIREn PositionAttribute), or null to skip
 */
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();
    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // each optional attribute is looked up only when its expectation array is
    // supplied, so analyzers lacking that attribute still pass the other checks
    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }
    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }
    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }
        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }
    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}
From source file:com.sindicetech.siren.analysis.TestConciseJsonAnalyzer.java
License:Open Source License
@Test public void testNumeric() throws Exception { _a.registerDatatype(XSDDatatype.XSD_LONG.toCharArray(), new LongNumericAnalyzer(64)); final TokenStream t = _a.tokenStream("", new StringReader("{ \"a\" : 12 }")); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); t.reset();// w ww . ja v a 2 s . c om assertTrue(t.incrementToken()); assertTrue(termAtt.toString().startsWith("a:")); t.end(); t.close(); }
From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java
License:Open Source License
public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages) throws Exception { assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); stream.reset();//from w ww . ja v a 2s. c om for (int i = 0; i < expectedImages.length; i++) { stream.clearAttributes(); assertTrue("token " + i + " does not exists", stream.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); } assertFalse("end of stream", stream.incrementToken()); stream.end(); stream.close(); }
From source file:com.stratio.cassandra.index.query.Condition.java
License:Apache License
/**
 * Analyzes {@code value} with the column mapper's analyzer and returns the
 * single resulting term as a UTF-8 string.
 *
 * @param field        the field name passed to the analyzer
 * @param value        the raw text to analyze; must yield exactly one term
 * @param columnMapper supplies the analyzer to use
 * @return the analyzed term, or null if the analyzer produced no token
 * @throws IllegalArgumentException if the analyzer produced more than one term
 * @throws RuntimeException         wrapping any IOException from the stream
 */
protected String analyze(String field, String value, ColumnMapper<?> columnMapper) {
    TokenStream source = null;
    try {
        Analyzer analyzer = columnMapper.analyzer();
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        // getBytesRef() returns a reference that fillBytesRef() later populates
        // in place, so it is safe to fetch before the first token is consumed
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            // no token at all — the caller treats this as "nothing to match"
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        // deep copy: 'bytes' aliases the attribute's internal buffer, which is
        // invalidated once the stream is released
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        // closes quietly; handles 'source' still being null if tokenStream() threw
        IOUtils.closeWhileHandlingException(source);
    }
}