List of usage examples for org.apache.lucene.search.highlight.TextFragment#getScore
public float getScore()
From source file:com.o19s.solr.swan.highlight.SwanHighlighter.java
License:Apache License
private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc, String fieldName) throws IOException { final SolrIndexSearcher searcher = req.getSearcher(); final IndexSchema schema = searcher.getSchema(); // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) - // so we disable them until fixed (see LUCENE-3080)! // BEGIN: Hack final SchemaField schemaField = schema.getFieldOrNull(fieldName); if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField) || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return;/*from w w w . j a va 2s .c om*/ // END: Hack SolrParams params = req.getParams(); IndexableField[] docFields = doc.getFields(fieldName); List<String> listFields = new ArrayList<String>(); for (IndexableField field : docFields) { listFields.add(field.stringValue()); } String[] docTexts = listFields.toArray(new String[listFields.size()]); // according to Document javadoc, doc.getValues() never returns null. check empty instead of null if (docTexts.length == 0) return; TokenStream tokenStream; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); List<TextFragment> frags = new ArrayList<TextFragment>(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization try { // TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName); // if (tvStream != null) { // tots = new TermOffsetsTokenStream(tvStream); // } } catch (IllegalArgumentException e) { // No problem. But we can't use TermOffsets optimization. } for (int j = 0; j < docTexts.length; j++) { if (tots != null) { // if we're using TermOffsets optimization, then get the next // field value's TokenStream (i.e. 
get field j's TokenStream) from tots: tokenStream = tots.getMultiValuedTokenStream(docTexts[j].length()); } else { // fall back to analyzer tokenStream = createAnalyzerTStream(schema, fieldName, docTexts[j]); } int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { if (maxCharsToAnalyze < 0) { tokenStream = new CachingTokenFilter(tokenStream); } else { tokenStream = new CachingTokenFilter( new OffsetLimitTokenFilter(tokenStream, maxCharsToAnalyze)); } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tokenStream); // after highlighter initialization, reset tstream since construction of highlighter already used it tokenStream.reset(); } else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); } try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tokenStream, docTexts[j], mergeContiguousFragments, numFragments); for (int k = 0; k < bestTextFragments.length; k++) { if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) { frags.add(bestTextFragments[k]); } } } catch (InvalidTokenOffsetsException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } // sort such that the fragments with the highest score come first Collections.sort(frags, new Comparator<TextFragment>() { public int compare(TextFragment arg0, TextFragment arg1) { return Math.round(arg1.getScore() - arg0.getScore()); } }); // convert fragments back into text // TODO: we can include score and position information in output as snippet attributes String[] summaries = null; if (frags.size() > 0) { ArrayList<String> fragTexts = new 
ArrayList<String>(); for (TextFragment fragment : frags) { if ((fragment != null) && (fragment.getScore() > 0)) { fragTexts.add(fragment.toString()); } if (fragTexts.size() >= numFragments) break; } summaries = (String[]) fragTexts.toArray(); if (summaries.length > 0) docSummaries.add(fieldName, summaries); } // no summeries made, copy text from alternate field if (summaries == null || summaries.length == 0) { alternateField(docSummaries, params, doc, fieldName); } }
From source file:com.tripod.lucene.service.AbstractLuceneService.java
License:Apache License
/** * Performs highlighting for a given query and a given document. * * @param indexSearcher the IndexSearcher performing the query * @param query the Tripod LuceneQuery//w w w. j a v a2 s . c o m * @param scoreDoc the Lucene ScoreDoc * @param doc the Lucene Document * @param highlighter the Highlighter to use * @param result the QueryResult to add the highlights to * @throws IOException if an error occurs performing the highlighting * @throws InvalidTokenOffsetsException if an error occurs performing the highlighting */ protected void performHighlighting(final IndexSearcher indexSearcher, final Q query, final ScoreDoc scoreDoc, final Document doc, final Highlighter highlighter, final QR result) throws IOException, InvalidTokenOffsetsException { if (query.getHighlightFields() == null || query.getHighlightFields().isEmpty()) { return; } final List<Highlight> highlights = new ArrayList<>(); final List<String> hlFieldNames = getHighlightFieldNames(query, doc); // process each field to highlight on for (String hlField : hlFieldNames) { final String text = doc.get(hlField); if (StringUtils.isEmpty(text)) { continue; } final List<String> snippets = new ArrayList<>(); final Fields tvFields = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc); final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1; // get the snippets for the given field final TokenStream tokenStream = TokenSources.getTokenStream(hlField, tvFields, text, analyzer, maxStartOffset); final TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, false, 10); for (TextFragment textFragment : textFragments) { if (textFragment != null && textFragment.getScore() > 0) { snippets.add(textFragment.toString()); } } // if we have snippets then add a highlight result to the QueryResult if (snippets.size() > 0) { highlights.add(new Highlight(hlField, snippets)); } } result.setHighlights(highlights); }
From source file:Example.lucene.SearchNHilight.java
public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "golf user"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr);// w ww . j av a2 s. com // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:Main.WebAPI.Search.java
/** * //from w w w . j ava 2 s .co m * @param args args[0] is a query * * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "mike lab"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr); // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:net.riezebos.thoth.content.search.Searcher.java
License:Apache License
public PagedList<SearchResult> search(Identity identity, String queryExpression, int pageNumber, int pageSize) throws SearchException { try {/*from www. j ava 2s .c o m*/ IndexReader reader = getIndexReader(contentManager); IndexSearcher searcher = getIndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); // We might need to restrict the results to books of the user does not have access to fragments: AccessManager accessManager = contentManager.getAccessManager(); boolean booksOnly = !accessManager.hasPermission(identity, "", Permission.READ_FRAGMENTS); if (booksOnly) { queryExpression = Indexer.INDEX_TYPE + ":" + Indexer.TYPE_DOCUMENT + " AND (" + queryExpression + ")"; } QueryParser parser = new QueryParser(Indexer.INDEX_CONTENTS, analyzer); Query query = parser.parse(queryExpression); // We add 1 to determine if there is more to be found after the current page int maxResults = pageSize * pageNumber + 1; TopDocs results = searcher.search(query, maxResults, Sort.RELEVANCE); ScoreDoc[] hits = results.scoreDocs; boolean hadMore = (hits.length == maxResults); List<SearchResult> searchResults = new ArrayList<>(); int idx = 0; for (ScoreDoc scoreDoc : hits) { if (searchResults.size() == pageSize) break; idx++; if (idx >= (pageNumber - 1) * pageSize) { Document document = searcher.doc(scoreDoc.doc); IndexableField field = document.getField(Indexer.INDEX_PATH); String documentPath = field.stringValue(); SearchResult searchResult = new SearchResult(); searchResult.setIndexNumber((pageNumber - 1) * pageSize + idx); searchResult.setDocument(documentPath); String type = document.get(Indexer.INDEX_TYPE); if (Indexer.TYPE_DOCUMENT.equals(type) || Indexer.TYPE_FRAGMENT.equals(type)) { searchResult.setResource(false); try { MarkDownDocument markDownDocument = contentManager.getMarkDownDocument(documentPath, true, CriticProcessingMode.DO_NOTHING); String contents = markDownDocument.getMarkdown(); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); 
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query, Indexer.INDEX_CONTENTS)); highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); TokenStream tokenStream = analyzer.tokenStream(Indexer.INDEX_CONTENTS, contents); TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, contents, false, 99999); for (TextFragment frag : frags) { if ((frag != null) && (frag.getScore() > 0)) { String fragmentText = frag.toString(); searchResult.addFragment( new Fragment(ThothCoreUtil.escapeHtmlExcept("B", fragmentText))); } } } catch (FileNotFoundException e) { LOG.warn( "Index contains an invalid file reference); probably need to reindex to get rid of this. File: " + e.getMessage()); } } else { searchResult.setResource(true); String extension = ThothUtil.getExtension(documentPath); searchResult.setImage(getConfiguration().isImageExtension(extension)); searchResult.addFragment(new Fragment(document.get(Indexer.INDEX_TITLE))); } searchResults.add(searchResult); } } reader.close(); linkBooks(searchResults); PagedList<SearchResult> pagedList = new PagedList<>(searchResults, hadMore); return pagedList; } catch (Exception e) { throw new SearchException(e); } }
From source file:org.apache.solr.handler.component.AlfrescoSolrHighlighter.java
License:Open Source License
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query,
        IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    // how many values of a multiValued field to look at (1 for single-valued fields)
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) {//e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();
    //Try term vectors, which is faster
    // note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields,
            maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        // stop once the fragment budget or the character budget is exhausted
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            tstream = tvStream; // single-valued with term vectors
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does,
            // the tokenStream needs to implement reset() efficiently.
            //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(
                            new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter
                    && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            //tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        // charge this value's length against the remaining character budget
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream,
                    fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null)//can happen via mergeContiguousFragments
                    continue;
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    } //end field value loop
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    return null;//no highlights for this field
}
From source file:org.apache.solr.highlight.DefaultSolrHighlighter.java
License:Apache License
private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc, String fieldName) throws IOException { final SolrIndexSearcher searcher = req.getSearcher(); final IndexSchema schema = searcher.getSchema(); // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) - // so we disable them until fixed (see LUCENE-3080)! // BEGIN: Hack final SchemaField schemaField = schema.getFieldOrNull(fieldName); if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField) || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return;// www . j ava2 s . co m // END: Hack SolrParams params = req.getParams(); // preserve order of values in a multiValued list boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); List<IndexableField> allFields = doc.getFields(); if (allFields != null && allFields.size() == 0) return; // No explicit contract that getFields returns != null, // although currently it can't. 
TokenStream tstream = null; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); String[] summaries = null; List<TextFragment> frags = new ArrayList<TextFragment>(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); if (tvStream != null) { tots = new TermOffsetsTokenStream(tvStream); } int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, Integer.toString(Integer.MAX_VALUE))); int mvToMatch = Integer.parseInt( req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE))); for (IndexableField thisField : allFields) { if (mvToExamine <= 0 || mvToMatch <= 0) break; if (!thisField.name().equals(fieldName)) continue; // Is there a better way to do this? --mvToExamine; String thisText = thisField.stringValue(); if (tots != null) { // if we're using TermOffsets optimization, then get the next // field value's TokenStream (i.e. 
get field j's TokenStream) from tots: tstream = tots.getMultiValuedTokenStream(thisText.length()); } else { // fall back to analyzer tstream = createAnalyzerTStream(schema, fieldName, thisText); } int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { if (maxCharsToAnalyze < 0) { tstream = new CachingTokenFilter(tstream); } else { tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); // after highlighter initialization, reset tstream since construction of highlighter already used it tstream.reset(); } else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(thisText.length()); } else { highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); } try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments); for (int k = 0; k < bestTextFragments.length; k++) { if (preserveMulti) { if (bestTextFragments[k] != null) { frags.add(bestTextFragments[k]); --mvToMatch; } } else { if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) { frags.add(bestTextFragments[k]); --mvToMatch; } } } } catch (InvalidTokenOffsetsException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } // sort such that the fragments with the highest score come first if (!preserveMulti) { Collections.sort(frags, new Comparator<TextFragment>() { @Override public int compare(TextFragment arg0, TextFragment arg1) { return Math.round(arg1.getScore() - arg0.getScore()); } }); } // convert fragments back into text // TODO: we can include score and position information in output as 
snippet attributes if (frags.size() > 0) { ArrayList<String> fragTexts = new ArrayList<String>(); for (TextFragment fragment : frags) { if (preserveMulti) { if (fragment != null) { fragTexts.add(fragment.toString()); } } else { if ((fragment != null) && (fragment.getScore() > 0)) { fragTexts.add(fragment.toString()); } } if (fragTexts.size() >= numFragments && !preserveMulti) break; } summaries = fragTexts.toArray(new String[0]); if (summaries.length > 0) docSummaries.add(fieldName, summaries); } // no summeries made, copy text from alternate field if (summaries == null || summaries.length == 0) { alternateField(docSummaries, params, doc, fieldName); } }
From source file:org.apache.solr.highlight.ParsedContentSolrHighlighter.java
License:Apache License
/** * Generates a list of Highlighted query fragments for each item in a list * of documents, or returns null if highlighting is disabled. * //www .j a v a 2s . com * @param docs * query results * @param query * the query * @param req * the current request * @param defaultFields * default list of fields to summarize * @return NamedList containing a NamedList for each document, which in * turns contains sets (field, summary) pairs. */ @SuppressWarnings("unchecked") public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException { SolrParams params = req.getParams(); if (!isHighlightingEnabled(params)) return null; SolrIndexSearcher searcher = req.getSearcher(); IndexSchema schema = searcher.getSchema(); NamedList fragments = new SimpleOrderedMap(); String[] fieldNames = getHighlightFields(query, req, defaultFields); Document[] readDocs = new Document[docs.size()]; { // pre-fetch documents using the Searcher's doc cache Set<String> fset = new HashSet<String>(); for (String f : fieldNames) { fset.add(f); } // fetch unique key if one exists. 
SchemaField keyField = schema.getUniqueKeyField(); if (null != keyField) fset.add(keyField.getName()); searcher.readDocs(readDocs, docs, fset); } // Highlight each document DocIterator iterator = docs.iterator(); for (int i = 0; i < docs.size(); i++) { int docId = iterator.nextDoc(); Document doc = readDocs[i]; NamedList docSummaries = new SimpleOrderedMap(); for (String fieldName : fieldNames) { fieldName = fieldName.trim(); // begin String[] docTexts = doc.getValues(fieldName); //Highlight only the parsed content, instead of all fields if (IndexField.DEFAULT_SEARCH_FIELD.equals(fieldName)) { docTexts = doc.getValues(IndexField.PARSED_CONTENT_FIELD); } // IndexFieldServices indexFieldServices = ConstellioSpringUtils.getIndexFieldServices(); // String collectionName = params.get(ConstellioSolrQueryParams.COLLECTION_NAME); // RecordCollectionServices collectionServices = ConstellioSpringUtils.getRecordCollectionServices(); // RecordCollection collection = collectionServices.get(collectionName); // IndexField defaultSearchField = collection.getDefaultSearchIndexField(); // // List<String> defaultSearchFieldDocTextsList = new ArrayList<String>(); // for (CopyField copyField : defaultSearchField.getCopyFieldsDest()) { // IndexField sourceIndexField = copyField.getIndexFieldSource(); // if (sourceIndexField != null) { // String sourceIndexFieldName = sourceIndexField.getName(); // String[] copyFieldValues = doc.getValues(sourceIndexFieldName); // if (copyFieldValues != null) { // for (int k = 0; k < copyFieldValues.length; k++) { // String copyFieldValue = copyFieldValues[k]; // if (!defaultSearchFieldDocTextsList.contains(copyFieldValue)) { // defaultSearchFieldDocTextsList.add(copyFieldValue); // } // } // } // } // } // docTexts = defaultSearchFieldDocTextsList.toArray(new String[0]); // if ((docTexts == null || docTexts.length == 0)) { // RecordServices recordServices = ConstellioSpringUtils.getRecordServices(); // Long recordId = new 
Long(doc.getField(IndexField.RECORD_ID_FIELD).stringValue()); // Record record; // try { // record = recordServices.get(recordId, collection); // } catch (Exception e) { // record = null; // e.printStackTrace(); // } // if (record != null) { // List<Object> fieldValues = indexFieldServices.extractFieldValues(record, defaultSearchField); // // List<String> docTextsList = new ArrayList<String>(); // for (Object fieldValue : fieldValues) { // String strFieldValue = fieldValue != null ? fieldValue.toString() : null; // if (StringUtils.isNotBlank(strFieldValue)) { // docTextsList.add(strFieldValue); // } // } // // if (!docTextsList.isEmpty()) { // docTexts = docTextsList.toArray(new String[0]); // } // } // } // // end if (docTexts == null) continue; TokenStream tstream = null; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); String[] summaries = null; List<TextFragment> frags = new ArrayList<TextFragment>(); for (int j = 0; j < docTexts.length; j++) { // create TokenStream try { // attempt term vectors tstream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); } catch (IllegalArgumentException e) { // fall back to anaylzer tstream = new TokenOrderingFilter( schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10); } Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) { // wrap CachingTokenFilter around TokenStream for reuse tstream = new CachingTokenFilter(tstream); // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); // after highlighter initialization, reset tstream since construction of highlighter // already used it tstream.reset(); } else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, 
Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); } try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j], mergeContiguousFragments, numFragments); for (int k = 0; k < bestTextFragments.length; k++) { if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) { frags.add(bestTextFragments[k]); } } } catch (InvalidTokenOffsetsException e) { throw new RuntimeException(e); } } // sort such that the fragments with the highest score come first Collections.sort(frags, new Comparator<TextFragment>() { public int compare(TextFragment arg0, TextFragment arg1) { return Math.round(arg1.getScore() - arg0.getScore()); } }); // convert fragments back into text // TODO: we can include score and position information in output as snippet attributes if (frags.size() > 0) { ArrayList<String> fragTexts = new ArrayList<String>(); for (TextFragment fragment : frags) { if ((fragment != null) && (fragment.getScore() > 0)) { // fragTexts.add(fragment.toString()); fragTexts.add(StringEscapeUtils.escapeHtml(fragment.toString())); } if (fragTexts.size() >= numFragments) break; } summaries = fragTexts.toArray(new String[0]); if (summaries.length > 0) docSummaries.add(fieldName, summaries); } // no summeries made, copy text from alternate field if (summaries == null || summaries.length == 0) { String alternateField = req.getParams().getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD); if (alternateField != null && alternateField.length() > 0) { String[] altTexts = doc.getValues(alternateField); if (altTexts != null && altTexts.length > 0) { int alternateFieldLen = req.getParams().getFieldInt(fieldName, HighlightParams.ALTERNATE_FIELD_LENGTH, 0); if (alternateFieldLen <= 0) { docSummaries.add(fieldName, altTexts); } else { List<String> altList = new ArrayList<String>(); int len = 0; for 
(String altText : altTexts) { altList.add(len + altText.length() > alternateFieldLen ? altText.substring(0, alternateFieldLen - len) : altText); len += altText.length(); if (len >= alternateFieldLen) break; } docSummaries.add(fieldName, altList); } } } } } String printId = schema.printableUniqueKey(doc); fragments.add(printId == null ? null : printId, docSummaries); } return fragments; }
From source file:org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter.java
License:Apache License
@Override public HighlightField highlight(HighlighterContext highlighterContext) { SearchContextHighlight.Field field = highlighterContext.field; SearchContext context = highlighterContext.context; FetchSubPhase.HitContext hitContext = highlighterContext.hitContext; FieldMapper mapper = highlighterContext.mapper; Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; if (!hitContext.cache().containsKey(CACHE_KEY)) { Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>(); hitContext.cache().put(CACHE_KEY, mappers); }//from w ww .j a v a 2 s.c o m @SuppressWarnings("unchecked") Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext .cache().get(CACHE_KEY); org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper); if (entry == null) { QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query, field.fieldOptions().requireFieldMatch() ? 
mapper.fieldType().name() : null); queryScorer.setExpandMultiTermQuery(true); Fragmenter fragmenter; if (field.fieldOptions().numberOfFragments() == 0) { fragmenter = new NullFragmenter(); } else if (field.fieldOptions().fragmenter() == null) { fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); } else if ("simple".equals(field.fieldOptions().fragmenter())) { fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize()); } else if ("span".equals(field.fieldOptions().fragmenter())) { fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); } else { throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]"); } Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]); entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer); entry.setTextFragmenter(fragmenter); // always highlight across all data entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); cache.put(mapper, entry); } // a HACK to make highlighter do highlighting, even though its using the single frag list builder int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 
1 : field.fieldOptions().numberOfFragments(); ArrayList<TextFragment> fragsList = new ArrayList<>(); List<Object> textsToHighlight; Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers() .indexAnalyzer(); try { textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext); for (Object textToHighlight : textsToHighlight) { String text = textToHighlight.toString(); try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) { if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) { // can't perform highlighting if the stream has no terms (binary token stream) or no offsets continue; } TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments); for (TextFragment bestTextFragment : bestTextFragments) { if (bestTextFragment != null && bestTextFragment.getScore() > 0) { fragsList.add(bestTextFragment); } } } } } catch (Exception e) { if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) { // this can happen if for example a field is not_analyzed and ignore_above option is set. // the field will be ignored when indexing but the huge term is still in the source and // the plain highlighter will parse the source and try to analyze it. 
return null; } else { throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } } if (field.fieldOptions().scoreOrdered()) { CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() { @Override public int compare(TextFragment o1, TextFragment o2) { return Math.round(o2.getScore() - o1.getScore()); } }); } String[] fragments; // number_of_fragments is set to 0 but we have a multivalued field if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) { fragments = new String[fragsList.size()]; for (int i = 0; i < fragsList.size(); i++) { fragments[i] = fragsList.get(i).toString(); } } else { // refine numberOfFragments if needed numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments; fragments = new String[numberOfFragments]; for (int i = 0; i < fragments.length; i++) { fragments[i] = fragsList.get(i).toString(); } } if (fragments.length > 0) { return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments)); } int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); if (noMatchSize > 0 && textsToHighlight.size() > 0) { // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary. String fieldContents = textsToHighlight.get(0).toString(); int end; try { end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents); } catch (Exception e) { throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } if (end > 0) { return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) }); } } return null; }
From source file:org.elasticsearch.search.highlight.PlainHighlighter.java
License:Apache License
public HighlightField highlight(HighlighterContext highlighterContext) { SearchContextHighlight.Field field = highlighterContext.field; SearchContext context = highlighterContext.context; FetchSubPhase.HitContext hitContext = highlighterContext.hitContext; FieldMapper<?> mapper = highlighterContext.mapper; Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; if (!hitContext.cache().containsKey(CACHE_KEY)) { Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> mappers = Maps.newHashMap(); hitContext.cache().put(CACHE_KEY, mappers); }/* ww w . j a v a 2s . c o m*/ @SuppressWarnings("unchecked") Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter>) hitContext .cache().get(CACHE_KEY); org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper); if (entry == null) { Query query = highlighterContext.query.originalQuery(); QueryScorer queryScorer = new CustomQueryScorer(query, field.fieldOptions().requireFieldMatch() ? 
mapper.names().indexName() : null); queryScorer.setExpandMultiTermQuery(true); Fragmenter fragmenter; if (field.fieldOptions().numberOfFragments() == 0) { fragmenter = new NullFragmenter(); } else if (field.fieldOptions().fragmenter() == null) { fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); } else if ("simple".equals(field.fieldOptions().fragmenter())) { fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize()); } else if ("span".equals(field.fieldOptions().fragmenter())) { fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); } else { throw new ElasticsearchIllegalArgumentException( "unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]"); } Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]); entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer); entry.setTextFragmenter(fragmenter); // always highlight across all data entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); cache.put(mapper, entry); } // a HACK to make highlighter do highlighting, even though its using the single frag list builder int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 
1 : field.fieldOptions().numberOfFragments(); ArrayList<TextFragment> fragsList = new ArrayList<TextFragment>(); List<Object> textsToHighlight; try { textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext); for (Object textToHighlight : textsToHighlight) { String text = textToHighlight.toString(); Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers() .indexAnalyzer(); TokenStream tokenStream = analyzer.tokenStream(mapper.names().indexName(), text); if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) { // can't perform highlighting if the stream has no terms (binary token stream) or no offsets continue; } TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments); for (TextFragment bestTextFragment : bestTextFragments) { if (bestTextFragment != null && bestTextFragment.getScore() > 0) { fragsList.add(bestTextFragment); } } } } catch (Exception e) { throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } if (field.fieldOptions().scoreOrdered()) { CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() { public int compare(TextFragment o1, TextFragment o2) { return Math.round(o2.getScore() - o1.getScore()); } }); } String[] fragments; // number_of_fragments is set to 0 but we have a multivalued field if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) { fragments = new String[fragsList.size()]; for (int i = 0; i < fragsList.size(); i++) { fragments[i] = fragsList.get(i).toString(); } } else { // refine numberOfFragments if needed numberOfFragments = fragsList.size() < numberOfFragments ? 
fragsList.size() : numberOfFragments; fragments = new String[numberOfFragments]; for (int i = 0; i < fragments.length; i++) { fragments[i] = fragsList.get(i).toString(); } } if (fragments.length > 0) { return new HighlightField(highlighterContext.fieldName, StringText.convertFromStringArray(fragments)); } int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); if (noMatchSize > 0 && textsToHighlight.size() > 0) { // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary. String fieldContents = textsToHighlight.get(0).toString(); Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers() .indexAnalyzer(); int end; try { end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer.tokenStream(mapper.names().indexName(), fieldContents)); } catch (Exception e) { throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } if (end > 0) { return new HighlightField(highlighterContext.fieldName, new Text[] { new StringText(fieldContents.substring(0, end)) }); } } return null; }