Example usage for org.apache.lucene.analysis TokenStream close

List of usage examples for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream close.

Prototype

@Override
public void close() throws IOException 
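
A stream that holds resources of its own typically overrides this method. Below is a minimal sketch of such an override, not taken from any of the sources listed on this page: the class name, the Closeable field, and the pass-through incrementToken() are purely illustrative.

import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical filter that owns an extra resource and releases it on close().
public final class ResourceReleasingFilter extends TokenFilter {

    private final Closeable resource; // illustrative: anything that must be released

    public ResourceReleasingFilter(TokenStream input, Closeable resource) {
        super(input);
        this.resource = resource;
    }

    @Override
    public boolean incrementToken() throws IOException {
        return input.incrementToken(); // pass tokens through unchanged
    }

    @Override
    public void close() throws IOException {
        try {
            super.close();    // closes the wrapped input stream
        } finally {
            resource.close(); // always release our own resource as well
        }
    }
}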

Document

Releases resources associated with this stream.
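
Every example below follows the same lifecycle: obtain a TokenStream from an Analyzer, call reset(), consume tokens with incrementToken(), call end(), and finally close(). The following is a minimal, self-contained sketch of that lifecycle, not taken from the sources below; the analyzer, field name, and sample text are assumptions, and try-with-resources is used because TokenStream implements Closeable, so close() runs even when tokenization throws.

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    public static void main(String[] args) throws Exception {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("body", "Example text to analyze")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // required before the first incrementToken()
            while (ts.incrementToken()) {
                terms.add(termAtt.toString());
            }
            ts.end();                    // records end-of-stream state (e.g. final offset)
        }                                // close() releases resources associated with the stream
        System.out.println(terms);       // exact tokens depend on the analyzer
    }
}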

Usage

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Filter prefixFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real filter
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        f.add(new PrefixLengthFilter(term, 1 + remainingSize, 1 + remainingSize),
                                BooleanClause.Occur.MUST);
                    else
                        f.add(new PrefixLengthFilter(term, 1 + chunkLength, 1 + chunkLength),
                                BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    f.add(new PrefixLengthFilter(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                f.add(new TermFilter(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Filter rangeFilter(String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper,
        @Nullable QueryParseContext context) {
    // Special case: -infinity to +infinity
    if (lowerTerm == null && upperTerm == null) {
        if (sizeIsVariable)
            return null;
        StringBuilder sbWildcardPart = new StringBuilder();
        for (int i = 0; i < chunkLength; i++)
            sbWildcardPart.append(wildcardOne);
        String wildcardPart = sbWildcardPart.toString();
        BooleanFilter filter = new BooleanFilter();
        for (int i = sizeValue / chunkLength - 1; i >= 0; i--) {
            filter.add(new WildcardFilter(names().createIndexNameTerm(prefixes.charAt(i) + wildcardPart)),
                    BooleanClause.Occur.MUST);
        }
        if (sizeValue % chunkLength != 0) {
            // If the size is not divisible by chunkLength,
            // we still have a last chunk, but that has a shorter length
            filter.add(
                    new WildcardFilter(names().createIndexNameTerm(prefixes.charAt(sizeValue / chunkLength + 1)
                            + wildcardPart.substring(0, sizeValue % chunkLength))),
                    BooleanClause.Occur.MUST);
        }
        return filter;
    }
    // Check for emptiness
    if (lowerTerm != null && upperTerm != null) {
        int cmp = lowerTerm.compareTo(upperTerm);
        // Bound inversion
        if (cmp > 0)
            return MatchNoDocsFilter.INSTANCE;
        // Equal bounds
        if (cmp == 0) {
            // and both inclusive bounds: singleton
            if (includeLower && includeUpper) {
                // Special case: equal terms
                return fieldFilter(lowerTerm, context);
            }
            // otherwise, empty range
            return MatchNoDocsFilter.INSTANCE;
        }
    }
    // Analyze lower and upper terms
    List<String> lowerTerms = new LinkedList<String>();
    List<String> upperTerms = new LinkedList<String>();
    if (lowerTerm != null) {
        TokenStream tok = null;
        try {
            tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(lowerTerm));
            tok.reset();
        } catch (IOException e) {
            return null;
        }
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        try {
            while (tok.incrementToken())
                lowerTerms.add(termAtt.toString());
            tok.end();
            tok.close();
        } catch (IOException e) {
            return null;
        }
    }
    if (upperTerm != null) {
        TokenStream tok = null;
        try {
            tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(upperTerm));
            tok.reset();
        } catch (IOException e) {
            return null;
        }
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        try {
            while (tok.incrementToken())
                upperTerms.add(termAtt.toString());
            tok.end();
            tok.close();
        } catch (IOException e) {
            return null;
        }
    }
    // Generate the filter
    BooleanFilter topLevelAndFilter = new BooleanFilter();
    Iterator<String> lowers = lowerTerms.iterator();
    Iterator<String> uppers = upperTerms.iterator();
    String currLower = null;
    String currUpper = null;
    int remainingLowerSize = sizeIsVariable ? 0 : sizeValue;
    int remainingUpperSize = sizeIsVariable ? 0 : sizeValue;

    // First, the common prefix
    while (lowers.hasNext() && uppers.hasNext()) {
        currLower = lowers.next();
        currUpper = uppers.next();
        // The last part cannot be part of the prefix
        // because that special case has already been handled
        if (!lowers.hasNext() || !uppers.hasNext())
            break;
        if (!currLower.equals(currUpper))
            break;
        topLevelAndFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        remainingLowerSize -= currLower.length() - 1;
        remainingUpperSize -= currUpper.length() - 1;
    }

    String subPrefixLower = currLower;
    BooleanFilter secondLevelOrFilter = new BooleanFilter();
    BooleanFilter lastFilter;
    // Add the range part of the query (secondLevelOrFilter); the prefix part is already in topLevelAndFilter
    topLevelAndFilter.add(secondLevelOrFilter, BooleanClause.Occur.MUST);
    // We still have secondLevelOrFilter to populate

    lastFilter = new BooleanFilter();
    // Handle the first diverging token of the lowerTerm (if it's not also the last available!)
    if (lowers.hasNext()) {
        lastFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        remainingLowerSize -= currLower.length() - 1;
        currLower = lowers.next();
    }
    secondLevelOrFilter.add(lastFilter, BooleanClause.Occur.SHOULD);
    // Then get to the last token of the lowerTerm
    while (lowers.hasNext()) {
        BooleanFilter orFilter = new BooleanFilter();
        lastFilter.add(orFilter, BooleanClause.Occur.MUST);
        orFilter.add(new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                false, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);
        BooleanFilter nextFilter = new BooleanFilter();
        nextFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        orFilter.add(nextFilter, BooleanClause.Occur.SHOULD);
        lastFilter = nextFilter;
        remainingLowerSize -= currLower.length() - 1;
        currLower = lowers.next();
    }
    // Handle the last token of the lowerTerm
    if (remainingLowerSize < 0)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), currLower,
                luceneTermUpperBound(currLower), includeLower, false, 0, 1 + chunkLength),
                BooleanClause.Occur.MUST);
    else if (remainingLowerSize < chunkLength)
        lastFilter.add(
                new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                        includeLower, false, 1 + remainingLowerSize, 1 + remainingLowerSize),
                BooleanClause.Occur.MUST);
    else
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), currLower,
                luceneTermUpperBound(currLower), includeLower, false, 1 + chunkLength, 1 + chunkLength),
                BooleanClause.Occur.MUST);

    // Range from the non-prefix part of the lowerTerm to the non-prefix part of the upperTerm
    if (remainingUpperSize < 0)
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper, false,
                false, 0, 1 + chunkLength), BooleanClause.Occur.SHOULD);
    else if (remainingUpperSize < chunkLength)
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper, false,
                false, 1 + remainingUpperSize, 1 + remainingUpperSize), BooleanClause.Occur.SHOULD);
    else
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper, false,
                false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);

    lastFilter = new BooleanFilter();
    // Handle the first diverging token of the upperTerm (if it's not also the last available!)
    if (uppers.hasNext()) {
        lastFilter.add(new TermFilter(names().createIndexNameTerm(currUpper)), BooleanClause.Occur.MUST);
        remainingUpperSize -= currUpper.length() - 1;
        currUpper = uppers.next();
    }
    secondLevelOrFilter.add(lastFilter, BooleanClause.Occur.SHOULD);
    // Then get to the last token of the upperTerm
    while (uppers.hasNext()) {
        BooleanFilter orFilter = new BooleanFilter();
        lastFilter.add(orFilter, BooleanClause.Occur.MUST);
        orFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                false, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);
        BooleanFilter nextFilter = new BooleanFilter();
        nextFilter.add(new TermFilter(names().createIndexNameTerm(currUpper)), BooleanClause.Occur.MUST);
        orFilter.add(nextFilter, BooleanClause.Occur.SHOULD);
        lastFilter = nextFilter;
        remainingUpperSize -= currUpper.length() - 1;
        currUpper = uppers.next();
    }
    // Handle the last token of the upperTerm
    if (remainingUpperSize < 0)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper),
                currUpper, false, includeUpper, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
    else if (remainingUpperSize < chunkLength)
        lastFilter.add(
                new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                        false, includeUpper, 1 + remainingUpperSize, 1 + remainingUpperSize),
                BooleanClause.Occur.MUST);
    else
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper),
                currUpper, false, includeUpper, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);

    return topLevelAndFilter;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method,
        @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            q.add(new WildcardQuery(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Filter wildcardFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            f.add(new WildcardFilter(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}

From source file:org.elasticsearch.index.mapper.token.AnalyzedTextFieldMapper.java

License:Apache License

static List<String> getAnalyzedText(TokenStream tokenStream) throws IOException {
    try {
        List<String> analyzedText = new ArrayList<>();
        CharTermAttribute terms = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            analyzedText.add(terms.toString());
        }
        tokenStream.end();
        return analyzedText;
    } finally {
        tokenStream.close();
    }
}

From source file:org.elasticsearch.index.query.CommonTermsQueryParser.java

License:Apache License

private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName,
        QueryParseContext parseContext, String queryAnalyzer, String lowFreqMinimumShouldMatch,
        String highFreqMinimumShouldMatch) throws IOException {

    FieldMapper<?> mapper = null;
    String field;
    MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
    if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
        mapper = smartNameFieldMappers.mapper();
        field = mapper.names().indexName();
    } else {
        field = fieldName;
    }

    Analyzer analyzer = null;
    if (queryAnalyzer == null) {
        if (mapper != null) {
            analyzer = mapper.searchAnalyzer();
        }
        if (analyzer == null && smartNameFieldMappers != null) {
            analyzer = smartNameFieldMappers.searchAnalyzer();
        }
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }
    } else {
        analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
        }
    }

    // Logic similar to QueryParser#getFieldQuery
    TokenStream source = analyzer.tokenStream(field, queryString.toString());
    int count = 0;
    try {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        while (source.incrementToken()) {
            BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for
                                                               // UTF-8
            UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
            query.add(new Term(field, ref));
            count++;
        }
    } finally {
        source.close();
    }

    if (count == 0) {
        return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}

From source file:org.elasticsearch.index.search.TextQueryParser.java

License:Apache License

public Query parse(Type type) {
    FieldMapper mapper = null;
    String field = fieldName;
    MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
    if (smartNameFieldMappers != null) {
        if (smartNameFieldMappers.hasMapper()) {
            mapper = smartNameFieldMappers.mapper();
            if (mapper != null) {
                field = mapper.names().indexName();
            }
        }
    }

    if (mapper != null && mapper.useFieldQueryWithQueryString()) {
        return wrapSmartNameQuery(mapper.fieldQuery(text, parseContext), smartNameFieldMappers, parseContext);
    }

    Analyzer analyzer = null;
    if (this.analyzer == null) {
        if (mapper != null) {
            analyzer = mapper.searchAnalyzer();
        }
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }
    } else {
        analyzer = parseContext.mapperService().analysisService().analyzer(this.analyzer);
        if (analyzer == null) {
            throw new ElasticSearchIllegalArgumentException("No analyzer found for [" + this.analyzer + "]");
        }
    }

    // Logic similar to QueryParser#getFieldQuery

    TokenStream source;
    try {
        source = analyzer.reusableTokenStream(field, new FastStringReader(text));
        source.reset();
    } catch (IOException e) {
        source = analyzer.tokenStream(field, new FastStringReader(text));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    CharTermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    boolean success = false;
    try {
        buffer.reset();
        success = true;
    } catch (IOException e) {
        // success==false if we hit an exception
    }
    if (success) {
        if (buffer.hasAttribute(CharTermAttribute.class)) {
            termAtt = buffer.getAttribute(CharTermAttribute.class);
        }
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    boolean hasMoreTokens = false;
    if (termAtt != null) {
        try {
            hasMoreTokens = buffer.incrementToken();
            while (hasMoreTokens) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
                hasMoreTokens = buffer.incrementToken();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
    } catch (IOException e) {
        // ignore
    }

    Term termFactory = new Term(field);
    if (numTokens == 0) {
        return MatchNoDocsQuery.INSTANCE;
    } else if (type == Type.BOOLEAN) {
        if (numTokens == 1) {
            String term = null;
            try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
            } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
            }
            Query q = newTermQuery(mapper, termFactory.createTerm(term));
            return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
        }
        BooleanQuery q = new BooleanQuery(positionCount == 1);
        for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
            } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
            }

            Query currentQuery = newTermQuery(mapper, termFactory.createTerm(term));
            q.add(currentQuery, occur);
        }
        return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
    } else if (type == Type.PHRASE) {
        if (severalTokensAtSamePosition) {
            MultiPhraseQuery mpq = new MultiPhraseQuery();
            mpq.setSlop(phraseSlop);
            List<Term> multiTerms = new ArrayList<Term>();
            int position = -1;
            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;
                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                if (positionIncrement > 0 && multiTerms.size() > 0) {
                    if (enablePositionIncrements) {
                        mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
                    } else {
                        mpq.add(multiTerms.toArray(new Term[multiTerms.size()]));
                    }
                    multiTerms.clear();
                }
                position += positionIncrement;
                multiTerms.add(termFactory.createTerm(term));
            }
            if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
            } else {
                mpq.add(multiTerms.toArray(new Term[multiTerms.size()]));
            }
            return wrapSmartNameQuery(mpq, smartNameFieldMappers, parseContext);
        } else {
            PhraseQuery pq = new PhraseQuery();
            pq.setSlop(phraseSlop);
            int position = -1;

            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;

                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                if (enablePositionIncrements) {
                    position += positionIncrement;
                    pq.add(termFactory.createTerm(term), position);
                } else {
                    pq.add(termFactory.createTerm(term));
                }
            }
            return wrapSmartNameQuery(pq, smartNameFieldMappers, parseContext);
        }
    } else if (type == Type.PHRASE_PREFIX) {
        MultiPhrasePrefixQuery mpq = new MultiPhrasePrefixQuery();
        mpq.setSlop(phraseSlop);
        mpq.setMaxExpansions(maxExpansions);
        List<Term> multiTerms = new ArrayList<Term>();
        int position = -1;
        for (int i = 0; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
                if (posIncrAtt != null) {
                    positionIncrement = posIncrAtt.getPositionIncrement();
                }
            } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
                } else {
                    mpq.add(multiTerms.toArray(new Term[multiTerms.size()]));
                }
                multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(termFactory.createTerm(term));
        }
        if (enablePositionIncrements) {
            mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
        } else {
            mpq.add(multiTerms.toArray(new Term[multiTerms.size()]));
        }
        return wrapSmartNameQuery(mpq, smartNameFieldMappers, parseContext);
    }

    throw new ElasticSearchIllegalStateException("No type found for [" + type + "]");
}

From source file:org.elasticsearch.search.aggregations.bucket.significant.SignificantTextAggregator.java

License:Apache License

@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub)
        throws IOException {
    final BytesRefBuilder previous = new BytesRefBuilder();
    return new LeafBucketCollectorBase(sub, null) {

        @Override
        public void collect(int doc, long bucket) throws IOException {
            collectFromSource(doc, bucket, fieldName, sourceFieldNames);
            numCollectedDocs++;
            if (dupSequenceSpotter != null) {
                dupSequenceSpotter.startNewSequence();
            }
        }

        private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms,
                String fieldText) throws IOException {
            if (dupSequenceSpotter != null) {
                ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
            }
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            try {
                while (ts.incrementToken()) {
                    if (dupSequenceSpotter != null) {
                        long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
                        long growth = newTrieSize - lastTrieSize;
                        // Only update the circuit breaker after a meaningful amount of growth
                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
                            addRequestCircuitBreakerBytes(growth);
                            lastTrieSize = newTrieSize;
                        }
                    }
                    previous.clear();
                    previous.copyChars(termAtt);
                    BytesRef bytes = previous.get();
                    if (inDocTerms.add(bytes) >= 0) {
                        if (includeExclude == null || includeExclude.accept(bytes)) {
                            long bucketOrdinal = bucketOrds.add(bytes);
                            if (bucketOrdinal < 0) { // already seen
                                bucketOrdinal = -1 - bucketOrdinal;
                                collectExistingBucket(sub, doc, bucketOrdinal);
                            } else {
                                collectBucket(sub, doc, bucketOrdinal);
                            }
                        }
                    }
                }

            } finally {
                ts.close();
            }
        }

        private void collectFromSource(int doc, long bucket, String indexedFieldName, String[] sourceFieldNames)
                throws IOException {
            MappedFieldType fieldType = context.getQueryShardContext().fieldMapper(indexedFieldName);
            if (fieldType == null) {
                throw new IllegalArgumentException("Aggregation [" + name + "] cannot process field ["
                        + indexedFieldName + "] since it is not present");
            }

            SourceLookup sourceLookup = context.lookup().source();
            sourceLookup.setSegmentAndDocument(ctx, doc);
            BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());

            try {
                for (String sourceField : sourceFieldNames) {
                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
                    textsToHighlight = textsToHighlight.stream().map(obj -> {
                        if (obj instanceof BytesRef) {
                            return fieldType.valueForDisplay(obj).toString();
                        } else {
                            return obj;
                        }
                    }).collect(Collectors.toList());

                    Analyzer analyzer = fieldType.indexAnalyzer();
                    for (Object fieldValue : textsToHighlight) {
                        String fieldText = fieldValue.toString();
                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
                        processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
                    }
                }
            } finally {
                Releasables.close(inDocTerms);
            }
        }
    };
}

From source file:org.elasticsearch.search.highlight.PlainHighlighter.java

License:Apache License

private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream)
        throws IOException {
    try {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    } finally {
        tokenStream.end();
        tokenStream.close();
    }
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTest.java

License:Apache License

@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()),
            MockTokenizer.WHITESPACE, true);
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester
                            .toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                    return finiteStrings;
                }
            });

    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);

}