Example usage for org.apache.lucene.analysis TokenStream hasAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#hasAttribute.

Prototype

public final boolean hasAttribute(Class<? extends Attribute> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value; the method returns true iff this AttributeSource contains the passed-in Attribute.
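
The examples below date from the Lucene 2.x/3.x era, when the token text was exposed through TermAttribute; in Lucene 4.0 and later that role is played by CharTermAttribute. For orientation, here is a minimal sketch of the same check-then-get pattern against a recent Lucene release (it assumes Lucene 5+, where StandardAnalyzer has a no-argument constructor; the field name and sample text are placeholders):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class HasAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources takes care of close().
        try (TokenStream stream = analyzer.tokenStream("body", "Hello attribute world")) {
            // getAttribute() throws IllegalArgumentException when the attribute is
            // absent; hasAttribute() lets the consumer probe safely first.
            if (!stream.hasAttribute(CharTermAttribute.class)) {
                throw new IllegalStateException("analyzer produced no term attribute");
            }
            CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.hasAttribute(OffsetAttribute.class)
                    ? stream.getAttribute(OffsetAttribute.class)
                    : null;

            stream.reset();
            while (stream.incrementToken()) {
                if (offsetAtt != null) {
                    System.out.println(termAtt + " [" + offsetAtt.startOffset() + ","
                            + offsetAtt.endOffset() + ")");
                } else {
                    System.out.println(termAtt);
                }
            }
            stream.end();
        }
    }
}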

Usage

From source file: org.sc.probro.lucene.BiothesaurusSearcher.java

License: Apache License

public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) { 
       query.add(new Term(field, array[i]));
    }
    */

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }

    return query;
}

From source file: org.sc.probro.lucene.ProteinSearcher.java

License: Apache License

public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                String term = termattr.term();
                tokens.add(term);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }

    return tokens.toArray(new String[0]);
}

From source file: org.sc.probro.lucene.ProteinSearcher.java

License: Apache License

public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) { 
       query.add(new Term(field, array[i]));
    }
    */

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }

    return query;
}

From source file: org.sd.text.lucene.LuceneUtils.java

License: Open Source License

/**
 * Split the string into tokens using the given analyzer.
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));

        try {
            tokenStream.reset(); // the TokenStream contract requires reset() before incrementToken()
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    result.add(termAttribute.term());
                }
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        result.add(string);
    }

    return result;
}

From source file: org.sd.text.lucene.LuceneUtils.java

License: Open Source License

public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;

        try {
            tokenStream.reset(); // the TokenStream contract requires reset() before incrementToken()
            while (tokenStream.incrementToken()) {
                boolean incPhrase = true;
                if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                    final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                            .getAttribute(OffsetAttribute.class);
                    if (offsetAttribute.startOffset() == lastEndOffset) {
                        incPhrase = false;
                    }
                    lastEndOffset = offsetAttribute.endOffset();
                }

                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    if (incPhrase && curPhrase.size() > 0) {
                        curPhrase = new ArrayList<String>();
                        result.add(curPhrase);
                    }

                    curPhrase.add(termAttribute.term());
                }
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        curPhrase.add(string);
    }

    return result;
}

From source file: org.sd.text.lucene.LuceneUtils.java

License: Open Source License

/**
 * Build a phrase query from the tokens in the given string using the given
 * analyzer.
 * <p>
 * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD
 * for fuzzy matches.
 */
public static final Query toQuery(Analyzer analyzer, String fieldName, String string,
        Collection<String> termCollector, BooleanClause.Occur occur) {
    Query result = null;

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));

        BooleanQuery booleanQuery = null;
        PhraseQuery phraseQuery = null;
        int lastEndOffset = 0;

        try {
            tokenStream.reset(); // the TokenStream contract requires reset() before incrementToken()
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    final String term = termAttribute.term();

                    // check offset attribute
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        if (offsetAttribute.startOffset() != lastEndOffset) {
                            // time to increment phrase
                            if (phraseQuery != null) {
                                if (booleanQuery == null)
                                    booleanQuery = new BooleanQuery();
                                booleanQuery.add(phraseQuery, occur);
                                phraseQuery = null;
                            }
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }

                    if (phraseQuery == null)
                        phraseQuery = new PhraseQuery();
                    phraseQuery.add(new Term(fieldName, term));
                    if (termCollector != null)
                        termCollector.add(term);
                }
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }

        if (phraseQuery != null) {
            if (booleanQuery == null)
                booleanQuery = new BooleanQuery();
            booleanQuery.add(phraseQuery, BooleanClause.Occur.SHOULD);
        }
        result = booleanQuery;
    }

    if (result == null) {
        result = new TermQuery(new Term(fieldName, string));
        if (termCollector != null)
            termCollector.add(string);
    }

    return result;
}

From source file: org.sindice.siren.analysis.TestTupleAnalyzer.java

License: Apache License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals(expectedImages[i], termAtt.term());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }

        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}

From source file: org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java

License: Apache License

private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {

        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}

From source file: perf.TestAnalyzerPerf.java

License: Apache License

private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {

        boolean isWarmup = i < warmupCount;

        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}

From source file: uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java

License: Mozilla Public License

public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce that the state is not preserved, and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this
                    // position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " +
                    // startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " +
                    // startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this
                    // position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " +
                    // endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " +
                    // endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
}