Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value; if this AttributeSource does not contain an instance of that attribute, an IllegalArgumentException is thrown (use addAttribute(Class) to create it on demand, or hasAttribute(Class) to check first).
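
A minimal, self-contained sketch of the typical consumer workflow around getAttribute (not taken from the examples below; the field name and input text are placeholders, and a Lucene version with a no-argument StandardAnalyzer constructor, 5.x or later, is assumed):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable; try-with-resources guarantees close()
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
            // getAttribute() throws IllegalArgumentException if the attribute is absent;
            // guard with hasAttribute(), or use addAttribute() to register it on demand
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                     // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                       // finalizes end-of-stream state
        }
    }
}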

Usage

From source file:tw.com.kyle.luminance.LumWindow.java

private Mappings prepare_mappings(int doc_id, String field) throws IOException {
    List<Integer> pos_list = new ArrayList<>();
    List<Integer> off_list = new ArrayList<>();

    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, field);
    if (tokenStream == null) {
        return null;
    }

    OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posincAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    int pos_counter = 0;
    while (tokenStream.incrementToken()) {
        pos_list.add(pos_counter);
        off_list.add(offsetAttr.startOffset());
        pos_counter += posincAttr.getPositionIncrement();
    }
    tokenStream.end();   // finalize and release the stream per the TokenStream contract
    tokenStream.close();

    Mappings mappings = new Mappings();
    mappings.off_list = off_list;
    mappings.pos_list = pos_list;
    return mappings;
}
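
Note that getAttribute(OffsetAttribute.class) throws an IllegalArgumentException if the stream does not expose offsets; when that is not guaranteed, addAttribute(OffsetAttribute.class) returns the existing instance or registers a new one instead of failing.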

From source file:uib.scratch.AnalyzerUtils.java

public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {

    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));

    try {
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(TermAttribute.class).term());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not expected here: the input is a StringReader
    }

    return result;
}
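
TermAttribute.term() used here is the pre-4.0 token-term API; it was deprecated in the 3.x series and removed in Lucene 4.0, where CharTermAttribute.toString(), as used by the later examples on this page, is the replacement.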

From source file:uk.ac.ebi.mdk.service.query.AbstractLuceneService.java

License:Open Source License

/**
 * Construct a query without using the QueryParser. This is useful when you
 * want to search a field that is analyzed and still match text containing
 * spaces. The token stream is converted into a boolean 'Must Occur' query.
 * For most simple queries this method can be used. The approximate flag
 * allows construction of approximate {@link FuzzyQuery} queries for each
 * token. The similarity for the fuzzy match can be set via the
 * {@link #setMinSimilarity(float)} method.
 *
 * @param text        text to construct the query for
 * @param term        the field to search the text in
 * @param approximate whether to use approximate search
 *
 * @return searchable query
 */
public Query construct(String text, Term term, boolean approximate) {

    StringReader reader = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(term.field(), reader);

    BooleanQuery query = new BooleanQuery();

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    try {
        stream.reset(); // required before consuming the stream
        while (stream.incrementToken()) {

            Term termToken = term.createTerm(termAttribute.toString());

            Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity())
                    : new TermQuery(termToken);

            query.add(subQuery, BooleanClause.Occur.MUST);

        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        LOGGER.error("Could not construct query", ex);
    }

    return query;
}
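
For illustration, a hypothetical call site (the service instance, field name, and query text are assumptions for this sketch, not part of the original source):

// AND together all analyzed tokens of the phrase in the 'name' field,
// with fuzzy matching for each token
Query query = service.construct("pyruvic acid", new Term("name"), true);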

From source file:uk.ac.ebi.mdk.service.query.AbstractLuceneService.java

License:Open Source License

public Query construct(String text, Term[] terms, boolean approximate) {

    BooleanQuery query = new BooleanQuery(false);

    for (Term term : terms) {
        StringReader reader = new StringReader(text);
        TokenStream stream = analyzer.tokenStream(term.field(), reader);

        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        BooleanQuery fieldQuery = new BooleanQuery(false);

        try {
            stream.reset(); // required before consuming the stream
            while (stream.incrementToken()) {

                Term termToken = term.createTerm(termAttribute.toString());

                Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity())
                        : new TermQuery(termToken);

                fieldQuery.add(subQuery, BooleanClause.Occur.MUST);

            }
            stream.end();
            stream.close();
        } catch (IOException ex) {
            LOGGER.error("Could not construct query", ex);
        }

        query.add(fieldQuery, BooleanClause.Occur.SHOULD);

    }

    return query;
}
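
This overload builds one all-tokens-MUST query per field and joins them with SHOULD clauses, so a document matches when every token of the text occurs in at least one of the supplied fields.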

From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java

License:Mozilla Public License

public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() above
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, i.e.
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this
                    // position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " +
                    // startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " +
                    // startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this
                    // position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " +
                    // endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " +
                    // endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() above

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
}
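
This helper closely mirrors assertTokenStreamContents from Lucene's BaseTokenStreamTestCase and exercises the full consumer contract: reset(), repeated incrementToken(), end(), and close(), validating each attribute for which the caller supplied expected values.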

From source file:workTextIndexService.Procesamiento.java

public String normalizar(String texto) {
    resultN = "";
    @SuppressWarnings("deprecation")
    SpanishAnalyzer analyzer = new SpanishAnalyzer(Version.LUCENE_4_10_1);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(texto));
        stream.reset();
        while (stream.incrementToken()) {
            resultN = resultN + (stream.getAttribute(CharTermAttribute.class).toString()) + " ";
        }
        stream.end();   // finalize per the TokenStream contract before closing
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return resultN.toLowerCase();
}
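
Note that SpanishAnalyzer already applies a LowerCaseFilter in its analysis chain, so the trailing toLowerCase() on the result is redundant, though harmless.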

From source file:ws.project.languagebasedlexiconanalisys.TwitterStreamAnalizer.java

public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close(); // analyzers reuse token streams, so close() is required before the next tokenStream() call
    } catch (IOException e) {
        // not expected here: the input is a StringReader
        throw new RuntimeException(e);
    }
    return result;

}