Example usage for org.apache.lucene.analysis TokenStream close

List of usage examples for org.apache.lucene.analysis TokenStream close

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

Releases resources associated with this stream.

Usage

From source file:org.sindice.siren.qparser.keyword.query.processors.AnalyzerQueryNodeProcessor.java

License:Apache License

/**
 * Analyzes the text of a {@link FieldQueryNode} with the configured analyzer
 * and rewrites the node according to the tokens produced:
 * no tokens yields a {@link NoTokenFoundQueryNode}; a single token keeps the
 * original node with the analyzed term; several tokens become an OR-group,
 * a boolean node, a multi-phrase node or a tokenized phrase node depending on
 * quoting and token positions. Wildcard, fuzzy and parametric nodes are left
 * untouched because analysis would destroy their special syntax.
 */
@Override
protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException {

    // Only plain textable nodes are analyzed; the excluded node types must
    // keep their original, un-analyzed text.
    if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode)
            && !(node instanceof FuzzyQueryNode) && !(node instanceof ParametricQueryNode)) {

        final FieldQueryNode fieldNode = ((FieldQueryNode) node);
        final String text = fieldNode.getTextAsString();
        final String field = fieldNode.getFieldAsString();

        // CachingTokenFilter buffers all tokens, so the stream can be consumed
        // twice: a first pass to count tokens/positions, then (after reset())
        // a second pass to build the replacement query nodes.
        final TokenStream source = this.analyzer.tokenStream(field, new StringReader(text));
        final CachingTokenFilter buffer = new CachingTokenFilter(source);

        PositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        boolean severalTokensAtSamePosition = false;

        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }

        try {

            // First pass: count tokens and distinct positions. A position
            // increment of 0 means this token overlaps the previous one
            // (e.g. a synonym emitted at the same position).
            while (buffer.incrementToken()) {
                numTokens++;
                final int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;

                } else {
                    severalTokensAtSamePosition = true;
                }

            }

        } catch (final IOException e) {
            // ignore
        }

        try {
            // rewind the buffer stream
            buffer.reset();

            // close original stream - all tokens buffered
            source.close();
        } catch (final IOException e) {
            // ignore
        }

        if (!buffer.hasAttribute(CharTermAttribute.class)) {
            return new NoTokenFoundQueryNode();
        }

        final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

        if (numTokens == 0) {
            return new NoTokenFoundQueryNode();

        } else if (numTokens == 1) {
            // Exactly one token: keep the node, just replace its text with
            // the analyzed form of the term.
            String term = null;
            try {
                boolean hasNext;
                hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();

            } catch (final IOException e) {
                // safe to ignore, because we know the number of tokens
            }

            fieldNode.setText(term);

            return fieldNode;

        } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
            if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
                // no phrase query:
                final LinkedList<QueryNode> children = new LinkedList<QueryNode>();

                // Second pass: collect one FieldQueryNode per buffered token.
                // The -1, -1 arguments appear to be placeholder begin/end
                // offsets — TODO confirm against FieldQueryNode's contract.
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    try {
                        final boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();

                    } catch (final IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    children.add(new FieldQueryNode(field, term, -1, -1));

                }

                // If multiple terms at one single position, this must be a query
                // expansion. Perform a OR between the terms.
                if (severalTokensAtSamePosition && positionCount == 1) {
                    return new GroupQueryNode(new OrQueryNode(children));
                } else if (positionCount == 1) {
                    return new GroupQueryNode(new StandardBooleanQueryNode(children, true));
                } else {
                    return new StandardBooleanQueryNode(children, false);
                }

            } else {
                // phrase query with overlapping tokens: group tokens that
                // share a position into one MultiPhraseQueryNode "column".
                final MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

                final List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
                int position = -1;
                int i = 0;
                int termGroupCount = 0;

                for (; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        final boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }

                    } catch (final IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    // A positive increment starts a new position: flush the
                    // group of terms accumulated at the previous position.
                    if (positionIncrement > 0 && multiTerms.size() > 0) {

                        for (final FieldQueryNode termNode : multiTerms) {

                            if (this.positionIncrementsEnabled) {
                                termNode.setPositionIncrement(position);
                            } else {
                                termNode.setPositionIncrement(termGroupCount);
                            }

                            mpq.add(termNode);
                        }

                        // Only increment once for each "group" of
                        // terms that were in the same position:
                        termGroupCount++;

                        multiTerms.clear();

                    }

                    position += positionIncrement;
                    multiTerms.add(new FieldQueryNode(field, term, -1, -1));

                }

                // Flush the final group (the loop above only flushes when a
                // new position starts).
                for (final FieldQueryNode termNode : multiTerms) {

                    if (this.positionIncrementsEnabled) {
                        termNode.setPositionIncrement(position);

                    } else {
                        termNode.setPositionIncrement(termGroupCount);
                    }

                    mpq.add(termNode);

                }

                return mpq;

            }

        } else {

            // Quoted, no overlapping tokens: a plain tokenized phrase query.
            final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

            int position = -1;

            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;

                try {
                    final boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();

                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }

                } catch (final IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

                // With increments enabled the real token position is used;
                // otherwise tokens are simply numbered consecutively.
                if (this.positionIncrementsEnabled) {
                    position += positionIncrement;
                    newFieldNode.setPositionIncrement(position);

                } else {
                    newFieldNode.setPositionIncrement(i);
                }

                pq.add(newFieldNode);

            }

            return pq;

        }

    }

    return node;

}

From source file:org.sindice.siren.qparser.ntriple.query.processors.ResourceAnalyzerQueryNodeProcessor.java

License:Apache License

/**
 * Analyzes the text of a {@link FieldQueryNode} with the configured analyzer
 * and rewrites the node according to the tokens produced. Unlike the classic
 * analyzer processor, the multi-token/non-phrase branch here either wraps the
 * terms in MOD_NONE modifiers inside a boolean group (single position) or
 * throws a {@link QueryNodeException} (terms spread over several positions),
 * since the latter case is unsupported. Wildcard, fuzzy and parametric nodes
 * are left untouched.
 */
@Override
protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException {

    // Only plain textable nodes are analyzed; the excluded node types must
    // keep their original, un-analyzed text.
    if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode)
            && !(node instanceof FuzzyQueryNode) && !(node instanceof ParametricQueryNode)) {

        final FieldQueryNode fieldNode = ((FieldQueryNode) node);
        final String text = fieldNode.getTextAsString();
        final String field = fieldNode.getFieldAsString();

        // CachingTokenFilter buffers all tokens, so the stream can be consumed
        // twice: a first pass to count tokens/positions, then (after reset())
        // a second pass to build the replacement query nodes.
        final TokenStream source = this.analyzer.tokenStream(field, new StringReader(text));
        final CachingTokenFilter buffer = new CachingTokenFilter(source);

        PositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        boolean severalTokensAtSamePosition = false;

        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }

        try {

            // First pass: count tokens and distinct positions. A position
            // increment of 0 means this token overlaps the previous one
            // (e.g. a synonym emitted at the same position).
            while (buffer.incrementToken()) {
                numTokens++;
                final int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;

                } else {
                    severalTokensAtSamePosition = true;
                }

            }

        } catch (final IOException e) {
            // ignore
        }

        try {
            // rewind the buffer stream
            buffer.reset();

            // close original stream - all tokens buffered
            source.close();
        } catch (final IOException e) {
            // ignore
        }

        if (!buffer.hasAttribute(CharTermAttribute.class)) {
            return new NoTokenFoundQueryNode();
        }

        final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

        if (numTokens == 0) {
            return new NoTokenFoundQueryNode();

        } else if (numTokens == 1) {
            // Exactly one token: keep the node, just replace its text with
            // the analyzed form of the term.
            String term = null;
            try {
                boolean hasNext;
                hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();

            } catch (final IOException e) {
                // safe to ignore, because we know the number of tokens
            }

            fieldNode.setText(term);

            return fieldNode;

        } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
            if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
                // no phrase query:
                final LinkedList<QueryNode> children = new LinkedList<QueryNode>();

                // Second pass: collect one FieldQueryNode per buffered token.
                // The -1, -1 arguments appear to be placeholder begin/end
                // offsets — TODO confirm against FieldQueryNode's contract.
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    try {
                        final boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();

                    } catch (final IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    children.add(new FieldQueryNode(field, term, -1, -1));

                }

                // If multiple terms at one single position, this must be a query
                // expansion. Perform a OR between the terms.
                if (positionCount == 1) {
                    final LinkedList<QueryNode> modified = new LinkedList<QueryNode>();
                    for (final QueryNode child : children) {
                        modified.add(new ModifierQueryNode(child, Modifier.MOD_NONE));
                    }
                    return new GroupQueryNode(new StandardBooleanQueryNode(modified, true));
                }
                // Multiple terms over more than one position.
                // Not able to support such a case. Usually, it is the result of a bad
                // use of filters at query time. Better to throw an exception.
                else {
                    throw new QueryNodeException(new MessageImpl("Multiple terms over more than one position"));
                }

            } else {
                // phrase query with overlapping tokens: group tokens that
                // share a position into one MultiPhraseQueryNode "column".
                final MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

                final List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
                int position = -1;
                int i = 0;
                int termGroupCount = 0;

                for (; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        final boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }

                    } catch (final IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    // A positive increment starts a new position: flush the
                    // group of terms accumulated at the previous position.
                    if (positionIncrement > 0 && multiTerms.size() > 0) {

                        for (final FieldQueryNode termNode : multiTerms) {

                            if (this.positionIncrementsEnabled) {
                                termNode.setPositionIncrement(position);
                            } else {
                                termNode.setPositionIncrement(termGroupCount);
                            }

                            mpq.add(termNode);
                        }

                        // Only increment once for each "group" of
                        // terms that were in the same position:
                        termGroupCount++;

                        multiTerms.clear();

                    }

                    position += positionIncrement;
                    multiTerms.add(new FieldQueryNode(field, term, -1, -1));

                }

                // Flush the final group (the loop above only flushes when a
                // new position starts).
                for (final FieldQueryNode termNode : multiTerms) {

                    if (this.positionIncrementsEnabled) {
                        termNode.setPositionIncrement(position);

                    } else {
                        termNode.setPositionIncrement(termGroupCount);
                    }

                    mpq.add(termNode);

                }

                return mpq;

            }

        } else {

            // Quoted, no overlapping tokens: a plain tokenized phrase query.
            final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

            int position = -1;

            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;

                try {
                    final boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();

                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }

                } catch (final IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

                // With increments enabled the real token position is used;
                // otherwise tokens are simply numbered consecutively.
                if (this.positionIncrementsEnabled) {
                    position += positionIncrement;
                    newFieldNode.setPositionIncrement(position);

                } else {
                    newFieldNode.setPositionIncrement(i);
                }

                pq.add(newFieldNode);

            }

            return pq;

        }

    }

    return node;

}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

/**
 * Tokenizes {@code s} with this instance's analyzer and adds every produced
 * term to {@code set}. Existing entries in {@code set} are kept.
 *
 * @param fieldName field name handed to the analyzer (selects the analysis chain)
 * @param s         raw text to tokenize
 * @param set       destination set for the term strings
 * @throws IOException if the token stream fails during analysis
 */
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    // try-with-resources guarantees the stream is closed even when reset()
    // or incrementToken() throws; the original leaked the stream in that case.
    try (TokenStream ts = analyzer.tokenStream(fieldName, s)) {
        CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            set.add(cattr.toString());
        }
        ts.end();
    }
}

From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java

License:Apache License

/**
 * Re-analyzes one field value and records the character offsets of every
 * requested token position into {@code results}.
 *
 * @param fieldName  field whose analysis chain is used
 * @param currInd    token-position index reached so far (across multiple values)
 * @param charBase   character offset to add to this value's local offsets
 * @param fieldValue the text to analyze
 * @param requests   the set of token positions whose offsets are wanted
 * @param results    receives (position, startOffset, endOffset, term) entries
 * @return the last token position seen, or {@code GOT_ALL_REQUESTS} once every
 *         requested position has been satisfied
 * @throws IOException if analysis fails
 */
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    // try-with-resources closes the stream on every exit path (the original
    // leaked it when reset()/incrementToken() threw, and duplicated the
    // end()/close() pair on its two return paths).
    try (TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue)) {
        stream.reset();

        int defaultInc = 1;

        CharTermAttribute termAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
        PositionIncrementAttribute incAtt = null;
        if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
            incAtt = stream
                    .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
        }

        while (stream.incrementToken()) {

            // Skip tokens that overlap the previous position (increment 0),
            // e.g. synonyms — only the first token at a position is recorded.
            if (incAtt != null && incAtt.getPositionIncrement() == 0) {
                continue;
            }

            currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
            if (requests.contains(currInd)) {
                results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                        termAtt.toString());
            }
            if (currInd > requests.getLast()) {
                // All requested positions satisfied: drain the stream so that
                // end() sees the true end-of-stream state.
                // TODO: Is there a way to avoid this? Or, is this
                // an imaginary performance hit?
                while (stream.incrementToken()) {
                    //NO-OP
                }
                stream.end();
                return GOT_ALL_REQUESTS;
            }
        }
        stream.end();
        return currInd;
    }
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java

License:Apache License

/**
 * allows reuse of terms, this method calls terms.clear() before adding new
 * terms/*from w ww .  ja  v  a  2 s .com*/
 *
 * @param s        string to analyze
 * @param field    to use in analysis
 * @param analyzer analyzer
 * @param terms    list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);

    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();

    return terms;
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

/**
 * Verifies that the bigram analyzer with unigrams disabled emits only the
 * joined bigrams, each with a position increment of 1.
 */
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);

    String s = "a b c d e f g";
    List<String> expected = Arrays.asList("a_b", "b_c", "c_d", "d_e", "e_f", "f_g");
    List<String> returned = new ArrayList<>();
    // try-with-resources: the stream is closed even when an assertion fails
    // mid-loop (the original leaked it in that case).
    try (TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
        while (tokenStream.incrementToken()) {
            // without unigrams every token is a new bigram: increment is always 1
            assertEquals(1, posIncAttribute.getPositionIncrement());
            returned.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

/**
 * Verifies that the bigram analyzer with unigrams enabled interleaves
 * unigrams (increment 1) and bigrams (increment 0, overlapping the unigram).
 */
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(
            new String[] { "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g", "g", });
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);

    String s = "a b c d e f g";
    List<String> returned = new ArrayList<>();
    // try-with-resources: the stream is closed even when an assertion fails
    // mid-loop (the original leaked it in that case).
    try (TokenStream tokenStream = analyzer.tokenStream("f", s)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

        int i = 0;
        while (tokenStream.incrementToken()) {
            // even-indexed tokens are unigrams (advance a position), odd-indexed
            // tokens are the bigrams stacked on top of them (increment 0)
            if (i++ % 2 == 0) {
                assertEquals(1, posIncAttribute.getPositionIncrement());
            } else {
                assertEquals(0, posIncAttribute.getPositionIncrement());
            }
            returned.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

/**
 * Smoke-tests the concordance searcher over an index built with the CJK
 * bigram analyzer (unigrams disabled). NOTE(review): the document/query
 * strings are empty here — they look like CJK literals that were lost in an
 * encoding conversion; confirm against the original source.
 */
@Test
public void testCJKNoUnigrams() throws Exception {

    Analyzer analyzer = getCJKBigramAnalyzer(false);
    // Exercise one analysis pass; try-with-resources closes the stream even
    // if reset()/end() throws. (Removed the unused stops/posIncGap/
    // charOffsetGap locals and the unused attribute lookups of the original.)
    try (TokenStream ts = analyzer.tokenStream(FIELD, "")) {
        ts.reset();
        ts.end();
    }

    String[] docs = new String[] { "" };

    // Close reader and directory on every path, including assertion/search
    // failures; resources close in reverse order (reader, then directory).
    try (Directory directory = getDirectory(analyzer, docs);
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        ConcordanceSearcher searcher = new ConcordanceSearcher(
                new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
        Query q = new TermQuery(new Term(FIELD, ""));
        //now test straight and span wrapper
        ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
        searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
        for (ConcordanceWindow w : collector.getWindows()) {
            //System.out.println(w);
        }
    }

}

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java

License:Apache License

/**
 * Segments {@code str} with the IK analyzer (smart mode), prints each token's
 * offsets/type, and returns the terms that are longer than one character or
 * purely numeric.
 *
 * @param str the text to segment
 * @return the retained term strings, in token order
 */
public static ArrayList<String> getTopicWord(String str) {
    // IK analyzer in smart (coarse-grained) segmentation mode.
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    // try-with-resources replaces the original's manual finally/close block;
    // close() failures are routed to the same IOException handler.
    try (TokenStream ts = analyzer.tokenStream("myfield", new StringReader(str))) {
        // token character offsets
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // token text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset the stream before consuming it
        ts.reset();
        while (ts.incrementToken()) {
            // hoisted: toString() copies the term buffer, the original
            // called it up to four times per token
            final String word = term.toString();
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + word
                    + " | " + type.type());
            // keep multi-character terms and purely numeric terms
            if (word.length() > 1 || word.matches("^[0-9]*$")) {
                retData.add(word);
            }
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final
                  // offset.

    } catch (IOException e) {
        e.printStackTrace();
    }
    return retData;
}

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java

License:Apache License

/**
 * Demo: segments a sample string with the IK analyzer (smart mode) and prints
 * each token's offsets, text and type. NOTE(review): the "???" literal looks
 * like a CJK sample sentence lost in an encoding conversion; it is preserved
 * as-is.
 */
public static void main(String[] args) {
    // IK analyzer in smart (coarse-grained) segmentation mode.
    Analyzer analyzer = new IKAnalyzer(true);

    // try-with-resources replaces the original's manual finally/close block;
    // close() failures are routed to the same IOException handler.
    try (TokenStream ts = analyzer.tokenStream("myfield", new StringReader("???"))) {
        // token character offsets
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // token text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset the stream before consuming it
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final
                  // offset.

    } catch (IOException e) {
        e.printStackTrace();
    }

}