List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
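Before the project-specific examples below, a minimal self-contained sketch of the usual pattern around addAttribute (assuming Lucene 5+, where StandardAnalyzer has a no-argument constructor; the field name and input text are illustrative). Note that addAttribute is idempotent: requesting the same attribute class twice returns the same instance.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token streams")) {
            // Creates the attribute if absent, otherwise returns the existing instance.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                    // records end-of-stream state (e.g. final offset)
        }
    }
}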
From source file: uk.ac.ebi.fg.biostudies.lucene.AnalyzerUtils.java
License: Apache License

public static void displayTokens(TokenStream stream) throws IOException {
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] ");
    }
    System.out.println();
}
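TermAttribute and its term() method belong to the pre-3.1 Lucene API; against current Lucene the same helper would use CharTermAttribute instead. A sketch, keeping the shape of the method above:

public static void displayTokens(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // modern TokenStreams must be reset before consumption
    while (stream.incrementToken()) {
        System.out.print("[" + term.toString() + "] ");
    }
    System.out.println();
}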
From source file: uk.ac.ebi.fg.biostudies.lucene.AnalyzerUtils.java
License: Apache License

public static List<String> getAnalyzedTokens(TokenStream stream, boolean sortedAlphabetically) throws IOException {
    List<String> result = new ArrayList<String>();
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        result.add(term.term());
    }
    if (sortedAlphabetically) {
        Collections.sort(result);
    }
    return result;
}
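A typical call site (the analyzer and input are illustrative; with a stop-word-removing, lowercasing analyzer such as StandardAnalyzer, "The" is dropped and the remaining terms are lowercased):

TokenStream stream = analyzer.tokenStream("contents", new StringReader("The Quick Brown Fox"));
List<String> tokens = AnalyzerUtils.getAnalyzedTokens(stream, true);
// e.g. [brown, fox, quick]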
From source file: uk.ac.ebi.fg.biostudies.lucene.AnalyzerUtils.java
License: Apache License

public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        // advance the token position by its position increment and print it
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term.term() + "] ");
    }
    System.out.println();
}
From source file: uk.ac.ebi.fg.biostudies.lucene.AnalyzerUtils.java
License: Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term.term() + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
}
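Note that the AnalyzerUtils helpers above skip parts of the documented TokenStream workflow (reset(), end(), close()); very old Lucene versions tolerated that, but current ones do not. A sketch of the full contract (consume() is a hypothetical placeholder for application logic):

TokenStream ts = analyzer.tokenStream("contents", new StringReader(text));
try {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                    // 1. rewind to a clean state
    while (ts.incrementToken()) {  // 2. advance token by token
        consume(term);             //    placeholder for application logic
    }
    ts.end();                      // 3. record end-of-stream state (final offset etc.)
} finally {
    ts.close();                    // 4. release resources
}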
From source file: uk.co.flax.luwak.assertions.TokenStreamAssert.java
License: Apache License

protected TokenStreamAssert(TokenStream actual) throws IOException {
    super(actual, TokenStreamAssert.class);
    termAtt = actual.addAttribute(CharTermAttribute.class);
    posIncAtt = actual.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = actual.addAttribute(OffsetAttribute.class);
    actual.reset();
    actualpos = 0;
}
From source file: uk.co.flax.luwak.presearcher.TermFilteredPresearcher.java
License: Apache License

@Override
public final Query buildQuery(LeafReader reader, QueryTermFilter queryTermFilter) {
    try {
        DocumentQueryBuilder queryBuilder = getQueryBuilder();
        for (String field : reader.fields()) {

            TokenStream ts = new TermsEnumTokenStream(reader.terms(field).iterator());
            for (PresearcherComponent component : components) {
                ts = component.filterDocumentTokens(field, ts);
            }
            ts = new BytesRefFilteredTokenFilter(ts, queryTermFilter.getTerms(field));

            TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            while (ts.incrementToken()) {
                queryBuilder.addTerm(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
            }
        }

        Query presearcherQuery = queryBuilder.build();

        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        bq.add(presearcherQuery, BooleanClause.Occur.SHOULD);
        bq.add(new TermQuery(new Term(ANYTOKEN_FIELD, ANYTOKEN)), BooleanClause.Occur.SHOULD);
        presearcherQuery = bq.build();

        for (PresearcherComponent component : components) {
            presearcherQuery = component.adjustPresearcherQuery(reader, presearcherQuery);
        }

        return presearcherQuery;
    } catch (IOException e) {
        // We're a MemoryIndex, so this shouldn't happen...
        throw new RuntimeException(e);
    }
}
From source file: uk.co.flax.luwak.testutils.TokenStreamUtils.java
License: Apache License

public static void dumpTokenStream(TokenStream ts) throws IOException {
    ts.reset();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
}
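A possible call site (the analyzer, field name, and input are illustrative). The helper resets the stream but neither ends nor closes it, so the caller should:

try (TokenStream ts = analyzer.tokenStream("f", "some test input")) {
    TokenStreamUtils.dumpTokenStream(ts); // calls reset() itself
    ts.end();
}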
From source file: uk.co.nickthecoder.pinkwino.metadata.LuceneMetaData.java
License: Open Source License

public String analyzeWord(String word) {
    Reader reader = new StringReader(word);
    TokenStream tokenStream = null;
    try {
        tokenStream = _analyzer.tokenStream("content", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            return charTermAttribute.toString(); // only the first token is wanted
        }
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
    } finally {
        try {
            if (tokenStream != null) {
                tokenStream.end();
                tokenStream.close();
            }
            reader.close();
        } catch (Exception e) {
            _logger.error("Failed to close during analyzeWord " + e);
        }
    }
    return null;
}
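Since TokenStream is Closeable, the same logic can be written more compactly on Java 7+ with try-with-resources (a sketch, reusing the _analyzer and _logger fields above):

public String analyzeWord(String word) {
    try (TokenStream tokenStream = _analyzer.tokenStream("content", new StringReader(word))) {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        String result = tokenStream.incrementToken() ? term.toString() : null;
        tokenStream.end(); // still required before close()
        return result;
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
        return null;
    }
}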
From source file: uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License: Mozilla Public License

public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce that the state is not preserved; also assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null) typeAtt.setType("bogusType");
        if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null) keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset
                        + " vs finalOffset=" + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }
            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset
                        + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, i.e. all tokens leaving from a certain pos
                // have the same startOffset, and all tokens arriving to a certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;
                final int posLength = posLengthAtt.getPositionLength();
                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                } else {
                    // We've seen a token leaving from this position before; verify the startOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }
                final int endPos = pos + posLength;
                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                } else {
                    // We've seen a token arriving to this position before; verify the endOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length
                + "); extra token=" + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}
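A call to this helper might look as follows (a sketch; the analyzer under test and the expected values are illustrative):

TokenStream ts = analyzer.tokenStream("field", new StringReader("quick brown"));
assertTokenStreamContents(ts,
        new String[] { "quick", "brown" }, // expected terms
        new int[] { 0, 6 },                // expected start offsets
        new int[] { 5, 11 },               // expected end offsets
        null,                              // types: not checked
        new int[] { 1, 1 },                // position increments
        null,                              // position lengths: not checked
        11,                                // final offset
        null,                              // final position increment: not checked
        null,                              // keyword flags: not checked
        true);                             // offsets are expected to be correct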
From source file: varaha.text.TokenizeText.java
License: Apache License

/** Fills a DataBag with tokens from a TokenStream */
public DataBag fillBag(TokenStream stream) throws IOException {
    DataBag result = bagFactory.newDefaultBag();
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttribute.length() > 0) {
                Tuple termText = tupleFactory.newTuple(termAttribute.toString());
                result.add(termText);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return result;
}
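Of the examples on this page, this one follows the complete TokenStream lifecycle (reset, incrementToken, end, close) inside a single method. A hypothetical call from the surrounding Pig UDF (the field name and input variable are illustrative):

TokenStream stream = analyzer.tokenStream("text", new StringReader(inputString));
DataBag tokens = fillBag(stream); // fillBag() resets, ends, and closes the stream itself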