List of usage examples for org.apache.lucene.analysis.TokenStream#hasAttribute
public final boolean hasAttribute(Class<? extends Attribute> attClass)
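A minimal, self-contained sketch of the call pattern the examples below share. It is not taken from any of the source files; it assumes a recent Lucene where StandardAnalyzer has a no-argument constructor, and the field name and input text are placeholders. hasAttribute only reports whether an attribute is present on the stream; getAttribute then retrieves it, and would throw IllegalArgumentException if the attribute were absent, which is what the check guards against.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HasAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", "Some Text To Tokenize")) {
            // Probe before reading: getAttribute() throws IllegalArgumentException
            // if the attribute is absent, so guard it with hasAttribute().
            if (stream.hasAttribute(CharTermAttribute.class)) {
                CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                stream.end();
            }
        }
        analyzer.close();
    }
}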
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                String term = termattr.term();
                tokens.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Split the string into tokens using the given analyzer.
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    result.add(termAttribute.term());
                }
            }
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        result.add(string);
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;
        try {
            while (tokenStream.incrementToken()) {
                boolean incPhrase = true;
                if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                    final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                            .getAttribute(OffsetAttribute.class);
                    if (offsetAttribute.startOffset() == lastEndOffset) {
                        incPhrase = false;
                    }
                    lastEndOffset = offsetAttribute.endOffset();
                }

                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    if (incPhrase && curPhrase.size() > 0) {
                        curPhrase = new ArrayList<String>();
                        result.add(curPhrase);
                    }
                    curPhrase.add(termAttribute.term());
                }
            }
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        curPhrase.add(string);
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Build a phrase query from the tokens in the given string using the given
 * analyzer.
 * <p>
 * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD
 * for fuzzy matches.
 */
public static final Query toQuery(Analyzer analyzer, String fieldName, String string,
        Collection<String> termCollector, BooleanClause.Occur occur) {
    Query result = null;

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        BooleanQuery booleanQuery = null;
        PhraseQuery phraseQuery = null;
        int lastEndOffset = 0;

        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    final String term = termAttribute.term();

                    // check offset attribute
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        if (offsetAttribute.startOffset() != lastEndOffset) {
                            // time to increment phrase
                            if (phraseQuery != null) {
                                if (booleanQuery == null)
                                    booleanQuery = new BooleanQuery();
                                booleanQuery.add(phraseQuery, occur);
                                phraseQuery = null;
                            }
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }

                    if (phraseQuery == null)
                        phraseQuery = new PhraseQuery();
                    phraseQuery.add(new Term(fieldName, term));
                    if (termCollector != null)
                        termCollector.add(term);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }

        if (phraseQuery != null) {
            if (booleanQuery == null)
                booleanQuery = new BooleanQuery();
            booleanQuery.add(phraseQuery, BooleanClause.Occur.SHOULD);
        }

        result = booleanQuery;
    }

    if (result == null) {
        result = new TermQuery(new Term(fieldName, string));
        if (termCollector != null)
            termCollector.add(string);
    }

    return result;
}
From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java
License:Apache License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals(expectedImages[i], termAtt.term());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }
        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {
        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}
From source file:perf.TestAnalyzerPerf.java
License:Apache License
private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {
        boolean isWarmup = i < warmupCount;
        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License:Mozilla Public License
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset
                        + " vs finalOffset=" + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset
                        + " is < lastStartOffset=" + lastStartOffset,
                        offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " + startOffset);
                } else {
                    // We've seen a token leaving from this position before;
                    // verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " + startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " + endOffset);
                } else {
                    // We've seen a token arriving to this position before;
                    // verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " + endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length
                + "); extra token=" + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}