Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

This page lists example usages of org.apache.lucene.analysis.TokenStream.end().

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, after #incrementToken() returned false (using the new TokenStream API).
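
Every example below follows the same consumer workflow: call reset() before the first incrementToken(), loop until incrementToken() returns false, call end() once to record the final token state (such as the final offset), and then close() the stream. The following is a minimal, self-contained sketch of that workflow; it assumes a current Lucene release (where TokenStream implements Closeable and Analyzer.tokenStream(String, String) is available), and the analyzer, field name, and input text are illustrative rather than taken from the examples below.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream after end() has been called
        try (TokenStream stream = analyzer.tokenStream("body", "A quick example sentence")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                 // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                   // called once, after incrementToken() returned false
        }
        analyzer.close();
    }
}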

Usage

From source file:org.sc.probro.lucene.BiothesaurusSearcher.java

License:Apache License

public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
        //stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                String term = termattr.term();
                tokens.add(term);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }

    return tokens.toArray(new String[0]);
}

From source file:org.sc.probro.lucene.BiothesaurusSearcher.java

License:Apache License

public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) { 
       query.add(new Term(field, array[i]));
    }
    */

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }

    return query;
}

From source file:org.sc.probro.lucene.ProteinSearcher.java

License:Apache License

public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                String term = termattr.term();
                tokens.add(term);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }

    return tokens.toArray(new String[0]);
}

From source file:org.sc.probro.lucene.ProteinSearcher.java

License:Apache License

public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) { 
       query.add(new Term(field, array[i]));
    }
    */

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }

    return query;
}

From source file:org.sindice.siren.analysis.filter.TestURINormalisationFilter.java

License:Apache License

public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    t.reset(new StringReader(input));
    final TokenStream filter = new URINormalisationFilter(t);

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", filter.incrementToken());

        assertEquals(expectedImages[i], termAtt.term());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }

        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", filter.incrementToken());
    filter.end();
}

From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java

License:Apache License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals(expectedImages[i], termAtt.term());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }

        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}

From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java

License:Apache License

private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {

        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java

License:Apache License

/**
 * allows reuse of terms; this method calls terms.clear() before adding new
 * terms.
 *
 * @param s        string to analyze
 * @param field    to use in analysis
 * @param analyzer analyzer
 * @param terms    list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);

    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();

    return terms;
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", });

    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}