List of usage examples for org.apache.lucene.search.highlight.TokenSources#getTermVectorTokenStreamOrNull
public static TokenStream getTermVectorTokenStreamOrNull(String field, Fields tvFields, int maxStartOffset) throws IOException
From source file: org.apache.solr.handler.component.AlfrescoSolrHighlighter.java
License:Open Source License
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */ @SuppressWarnings("unchecked") protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException { final SolrParams params = req.getParams(); final String fieldName = schemaField.getName(); final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1); // Technically this is the max *fragments* (snippets), not max values: int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE); if (mvToExamine <= 0 || mvToMatch <= 0) { return null; }/*w ww. ja va 2s. c o m*/ int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS); if (maxCharsToAnalyze < 0) {//e.g. -1 maxCharsToAnalyze = Integer.MAX_VALUE; } List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req); if (fieldValues.isEmpty()) { return null; } // preserve order of values in a multiValued list boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); List<TextFragment> frags = new ArrayList<>(); //Try term vectors, which is faster // note: offsets are minimally sufficient for this HL. final Fields tvFields = schemaField.storeTermOffsets() ? 
reader.getTermVectors(docId) : null; final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1); // We need to wrap in OffsetWindowTokenFilter if multi-valued final OffsetWindowTokenFilter tvWindowStream; if (tvStream != null && fieldValues.size() > 1) { tvWindowStream = new OffsetWindowTokenFilter(tvStream); } else { tvWindowStream = null; } for (String thisText : fieldValues) { if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) { break; } TokenStream tstream; if (tvWindowStream != null) { // if we have a multi-valued field with term vectors, then get the next offset window tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length()); } else if (tvStream != null) { tstream = tvStream; // single-valued with term vectors } else { // fall back to analyzer tstream = createAnalyzerTStream(schemaField, thisText); } Highlighter highlighter; if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) { // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream // needs to implement reset() efficiently. //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary. // It should be okay if OffsetLimit won't get applied in this case. final TokenStream tempTokenStream; if (tstream != tvStream) { if (maxCharsToAnalyze >= thisText.length()) { tempTokenStream = new CachingTokenFilter(tstream); } else { tempTokenStream = new CachingTokenFilter( new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); } } else { tempTokenStream = tstream; } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream); // if the CachingTokenFilter was consumed then use it going forward. if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) { tstream = tempTokenStream; } //tstream.reset(); not needed; getBestTextFragments will reset it. 
} else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); maxCharsToAnalyze -= thisText.length(); // Highlight! try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, fixLocalisedText(thisText), mergeContiguousFragments, numFragments); for (TextFragment bestTextFragment : bestTextFragments) { if (bestTextFragment == null)//can happen via mergeContiguousFragments continue; // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless. if (bestTextFragment.getScore() > 0 || preserveMulti) { frags.add(bestTextFragment); if (bestTextFragment.getScore() > 0) --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values } } } catch (InvalidTokenOffsetsException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } //end field value loop // Put the fragments onto the Solr response (docSummaries) if (frags.size() > 0) { // sort such that the fragments with the highest score come first if (!preserveMulti) { Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore())); } // Truncate list to hl.snippets, but not when hl.preserveMulti if (frags.size() > numFragments && !preserveMulti) { frags = frags.subList(0, numFragments); } return getResponseForFragments(frags, req); } return null;//no highlights for this field }