List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. This method never returns null: if the stream does not already contain an instance of the requested attribute, it throws an IllegalArgumentException (use addAttribute(Class) instead when the attribute should be created on demand).
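A minimal end-to-end sketch of the typical consume loop (assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text are illustrative). The attribute instances are fetched once with getAttribute and then re-read after each incrementToken(), since the stream updates them in place:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream(String, String) builds a stream over the given text.
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token stream world")) {
            // getAttribute throws IllegalArgumentException if the attribute is absent;
            // StandardAnalyzer streams always carry these two.
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end(); // records the final offset state
        }
        analyzer.close();
    }
}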
From source file:tw.com.kyle.luminance.LumWindow.java
private Mappings prepare_mappings(int doc_id, String field) throws IOException {
    List<Integer> pos_list = new ArrayList<>();
    List<Integer> off_list = new ArrayList<>();
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, field);
    if (tokenStream == null) {
        return null;
    }
    OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posincAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    int pos_counter = 0;
    while (tokenStream.incrementToken()) {
        pos_list.add(pos_counter);
        off_list.add(offsetAttr.startOffset());
        pos_counter += posincAttr.getPositionIncrement();
    }
    // Release the stream once the mappings have been collected.
    tokenStream.end();
    tokenStream.close();
    Mappings mappings = new Mappings();
    mappings.off_list = off_list;
    mappings.pos_list = pos_list;
    return mappings;
}
From source file:uib.scratch.AnalyzerUtils.java
public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = new ArrayList<String>();
    // TermAttribute is the pre-4.0 term API (replaced by CharTermAttribute in Lucene 4.0+).
    TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
    try {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(TermAttribute.class).term());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because the underlying reader is a StringReader
    }
    return result;
}
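TermAttribute and its term() method were removed in Lucene 4.0. A sketch of the same helper against the 4.0+ API, assuming nothing beyond swapping in CharTermAttribute and try-with-resources:

public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = new ArrayList<String>();
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords))) {
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException(e); // a StringReader should never actually throw
    }
    return result;
}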
From source file:uk.ac.ebi.mdk.service.query.AbstractLuceneService.java
License:Open Source License
/**
 * Construct a query without using the QueryParser. This is useful when you
 * want to search a field that is analyzed while keeping whitespace intact. The
 * token stream is converted into a boolean 'Must Occur' query. For most simple
 * queries this method can be used. The approximate flag allows construction
 * of approximate {@link FuzzyQuery} queries for each token. The similarity
 * for the fuzzy match can be set via the {@link #setMinSimilarity(float)}
 * method.
 *
 * @param text        text to construct the query for
 * @param term        the field to search the text in
 * @param approximate whether to use approximate search
 *
 * @return searchable query
 */
public Query construct(String text, Term term, boolean approximate) {
    StringReader reader = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(term.field(), reader);
    BooleanQuery query = new BooleanQuery();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    try {
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            Term termToken = term.createTerm(termAttribute.toString());
            Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity())
                                         : new TermQuery(termToken);
            query.add(subQuery, BooleanClause.Occur.MUST);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        LOGGER.error("Could not construct query", ex);
    }
    return query;
}
From source file:uk.ac.ebi.mdk.service.query.AbstractLuceneService.java
License:Open Source License
public Query construct(String text, Term[] terms, boolean approximate) {
    BooleanQuery query = new BooleanQuery(false);
    for (Term term : terms) {
        StringReader reader = new StringReader(text);
        TokenStream stream = analyzer.tokenStream(term.field(), reader);
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
        BooleanQuery fieldQuery = new BooleanQuery(false);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                Term termToken = term.createTerm(termAttribute.toString());
                Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity())
                                             : new TermQuery(termToken);
                fieldQuery.add(subQuery, BooleanClause.Occur.MUST);
            }
            stream.end();
            stream.close();
        } catch (IOException ex) {
            LOGGER.error("Could not construct query", ex);
        }
        query.add(fieldQuery, BooleanClause.Occur.SHOULD);
    }
    return query;
}
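A hypothetical call site for the two overloads; the service variable and the field names "name" and "synonym" are illustrative, not taken from the source:

// Single field: every token of the text must occur in "name".
Query byName = service.construct("adenosine triphosphate", new Term("name"), false);
// Multiple fields: any one field may satisfy its per-field MUST clauses (SHOULD across fields).
Query anyField = service.construct("adenosine triphosphate",
        new Term[] { new Term("name"), new Term("synonym") }, true);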
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License:Mozilla Public License
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce that the state is not preserved, and also assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        // reset it, because we called clearAttributes() before
        checkClearAtt.getAndResetClearCalled();
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset
                        + " vs finalOffset=" + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }
            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset
                        + " is < lastStartOffset=" + lastStartOffset,
                        offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie all tokens leaving from a
                // certain pos have the same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;
                final int posLength = posLengthAtt.getPositionLength();
                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                } else {
                    // We've seen a token leaving from this position before;
                    // verify the startOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }
                final int endPos = pos + posLength;
                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                } else {
                    // We've seen a token arriving to this position before;
                    // verify the endOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length
                + "); extra token=" + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    // reset it, because we called clearAttributes() before
    checkClearAtt.getAndResetClearCalled();

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}
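A sketch of how the helper might be invoked, assuming a whitespace-style analyzer so that "foo bar" tokenizes into two terms with the offsets shown, and assuming the test-framework CheckClearAttributesAttribute that the helper adds is on the classpath. Arguments passed as null are simply not checked:

TokenStream ts = analyzer.tokenStream("field", new StringReader("foo bar"));
assertTokenStreamContents(ts,
        new String[] { "foo", "bar" }, // expected terms
        new int[] { 0, 4 },            // expected start offsets
        new int[] { 3, 7 },            // expected end offsets
        null,                          // token types: not checked
        new int[] { 1, 1 },            // position increments
        null,                          // position lengths: not checked
        7,                             // expected final offset after end()
        null,                          // final position increment: not checked
        null,                          // per-token keyword flags: not checked
        true);                         // offsets are expected to be well-formed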
From source file:workTextIndexService.Procesamiento.java
public String normalizar(String texto) {
    resultN = "";
    @SuppressWarnings("deprecation")
    SpanishAnalyzer analyzer = new SpanishAnalyzer(Version.LUCENE_4_10_1);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(texto));
        stream.reset();
        while (stream.incrementToken()) {
            resultN = resultN + stream.getAttribute(CharTermAttribute.class).toString() + " ";
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return resultN.toLowerCase();
}
From source file:ws.project.languagebasedlexiconanalisys.TwitterStreamAnalizer.java
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown because we're reading from a StringReader
        throw new RuntimeException(e);
    }
    return result;
}