Example usage for the org.apache.lucene.search.highlight.OffsetLimitTokenFilter constructor

Introduction

This page collects example usages of the OffsetLimitTokenFilter constructor from org.apache.lucene.search.highlight.

Prototype

public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) 
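
The constructor wraps an existing TokenStream and stops producing tokens once the configured character-offset budget (offsetLimit) is used up, so downstream consumers such as the highlighter only analyze a bounded prefix of the text. A minimal standalone sketch, assuming Lucene 5+ (no-arg StandardAnalyzer); the field name "content" and the 20-character limit are illustrative only:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

public class OffsetLimitExample {
    public static void main(String[] args) throws Exception {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            String text = "tokens past the offset limit are simply not emitted";
            // Wrap the analyzer's stream; tokenization stops once the
            // 20-character offset budget is exhausted.
            TokenStream stream = new OffsetLimitTokenFilter(
                    analyzer.tokenStream("content", text), 20);
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
            stream.close();
        }
    }
}

As the examples below show, the filter is normally combined with CachingTokenFilter so the capped stream can be replayed by the phrase highlighter.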

Usage

From source file: com.o19s.solr.swan.highlight.SwanHighlighter.java

License: Apache License

private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
        listFields.add(field.stringValue());
    }

    String[] docTexts = listFields.toArray(new String[listFields.size()]);

    // according to Document javadoc, doc.getValues() never returns null. check empty instead of null
    if (docTexts.length == 0)
        return;

    TokenStream tokenStream;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    try {
        //      TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
        //      if (tvStream != null) {
        //        tots = new TermOffsetsTokenStream(tvStream);
        //      }
    } catch (IllegalArgumentException e) {
        // No problem. But we can't use TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tokenStream = tots.getMultiValuedTokenStream(docTexts[j].length());
        } else {
            // fall back to analyzer
            tokenStream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tokenStream = new CachingTokenFilter(tokenStream);
            } else {
                tokenStream = new CachingTokenFilter(
                        new OffsetLimitTokenFilter(tokenStream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tokenStream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tokenStream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tokenStream, docTexts[j],
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                    frags.add(bestTextFragments[k]);
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // sort such that the fragments with the highest score come first
    Collections.sort(frags, new Comparator<TextFragment>() {
        public int compare(TextFragment arg0, TextFragment arg1) {
            // Float.compare gives a proper total order; rounding the score
            // difference would collapse close scores and break transitivity.
            return Float.compare(arg1.getScore(), arg0.getScore());
        }
    });

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    String[] summaries = null;
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if ((fragment != null) && (fragment.getScore() > 0)) {
                fragTexts.add(fragment.toString());
            }
            if (fragTexts.size() >= numFragments)
                break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
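
The decisive pattern in this example (and in DefaultSolrHighlighter below) is the wrapping order: cap the stream with OffsetLimitTokenFilter when a character budget is configured, then cache it, because building the phrase highlighter consumes the stream once and it must be replayed afterwards. A minimal sketch of just that decision, reusing the tokenStream and maxCharsToAnalyze values computed in the method above:

    // A negative budget means "analyze everything": cache the raw stream.
    // Otherwise truncate at the budget first, then cache, so the stream
    // survives being consumed once by the phrase highlighter.
    tokenStream = (maxCharsToAnalyze < 0)
            ? new CachingTokenFilter(tokenStream)
            : new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxCharsToAnalyze));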

From source file: org.apache.solr.handler.component.AlfrescoSolrHighlighter.java

License: Open Source License

/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query,
        IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();

    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);

    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }

    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) { // e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }

    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<>();

    //Try term vectors, which is faster
    //  note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields,
            maxCharsToAnalyze - 1);
    //  We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }

    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }

        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            tstream = tvStream; // single-valued with term vectors
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }

        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
            // needs to implement reset() efficiently.

            //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            //  It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(
                            new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);

            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter
                    && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            //tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();

        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream,
                    fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null)//can happen via mergeContiguousFragments
                    continue;
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    } //end field value loop

    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }

        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    return null;//no highlights for this field
}

From source file: org.apache.solr.highlight.DefaultSolrHighlighter.java

License: Apache License

private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    List<IndexableField> allFields = doc.getFields();
    if (allFields != null && allFields.size() == 0)
        return; // No explicit contract that getFields returns != null,
    // although currently it can't.

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
    }
    int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            Integer.toString(Integer.MAX_VALUE)));
    int mvToMatch = Integer.parseInt(
            req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE)));

    for (IndexableField thisField : allFields) {
        if (mvToExamine <= 0 || mvToMatch <= 0)
            break;

        if (!thisField.name().equals(fieldName))
            continue; // Is there a better way to do this?

        --mvToExamine;
        String thisText = thisField.stringValue();

        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tstream = tots.getMultiValuedTokenStream(thisText.length());
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schema, fieldName, thisText);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tstream = new CachingTokenFilter(tstream);
            } else {
                tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tstream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(thisText.length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText,
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if (preserveMulti) {
                    if (bestTextFragments[k] != null) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                } else {
                    if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
        Collections.sort(frags, new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
                // Float.compare gives a proper total order; rounding the score
                // difference would collapse close scores and break transitivity.
                return Float.compare(arg1.getScore(), arg0.getScore());
            }
        });
    }

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if (preserveMulti) {
                if (fragment != null) {
                    fragTexts.add(fragment.toString());
                }
            } else {
                if ((fragment != null) && (fragment.getScore() > 0)) {
                    fragTexts.add(fragment.toString());
                }
            }

            if (fragTexts.size() >= numFragments && !preserveMulti)
                break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}