Example usage for org.apache.lucene.search.highlight TokenSources getTokenStream

List of usage examples for org.apache.lucene.search.highlight TokenSources getTokenStream

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight TokenSources getTokenStream.

Prototype

public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
        int maxStartOffset) throws IOException 

Source Link

Document

Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.

Usage

From source file:com.aurel.track.lucene.search.LuceneSearcher.java

License:Open Source License

/**
 * Runs the given query against the work item index and collects the matching item IDs.
 *
 * @param query the parsed Lucene query to execute
 * @param userQueryString the raw query string as typed by the user (logging only)
 * @param preprocessedQueryString the preprocessed query string (logging only)
 * @param highlightedTextMap if not null, filled with itemID -> best highlighted fragment
 * @return the IDs of the matching work items; an empty array when nothing matched or the search failed
 */
private static int[] getQueryResults(Query query, String userQueryString, String preprocessedQueryString,
        Map<Integer, String> highlightedTextMap) {
    int[] hitIDs = new int[0];
    IndexSearcher indexSearcher = null;
    try {
        long start = 0;
        if (LOGGER.isDebugEnabled()) {
            start = System.currentTimeMillis();
        }
        indexSearcher = getIndexSearcher(LuceneUtil.INDEXES.WORKITEM_INDEX);
        if (indexSearcher == null) {
            return hitIDs;
        }
        ScoreDoc[] scoreDocs;
        try {
            TopDocsCollector<ScoreDoc> collector = TopScoreDocCollector.create(MAXIMAL_HITS);
            indexSearcher.search(query, collector);
            scoreDocs = collector.topDocs().scoreDocs;
        } catch (IOException e) {
            LOGGER.warn("Getting the workitem search results failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
            return hitIDs;
        }
        if (LOGGER.isDebugEnabled()) {
            long end = System.currentTimeMillis();
            LOGGER.debug("Found " + scoreDocs.length + " document(s) (in " + (end - start)
                    + " milliseconds) that matched the user query '" + userQueryString
                    + "' the preprocessed query '" + preprocessedQueryString + "' and the query.toString() '"
                    + query.toString() + "'");
        }
        QueryScorer queryScorer = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        Highlighter highlighter = new Highlighter(queryScorer); // scorer picks the best fragments
        highlighter.setTextFragmenter(fragmenter); // set fragment to highlight
        hitIDs = new int[scoreDocs.length];
        for (int i = 0; i < scoreDocs.length; i++) {
            int docID = scoreDocs[i].doc;
            Document doc = null;
            try {
                doc = indexSearcher.doc(docID);
            } catch (IOException e) {
                LOGGER.error("Getting the workitem documents failed with " + e.getMessage());
                LOGGER.debug(ExceptionUtils.getStackTrace(e));
            }
            if (doc != null) {
                // doc.get() returns null when the field is absent; guard before parsing
                // (the previous null-check on Integer.valueOf's result could never fire).
                String itemIDValue = doc.get(LuceneUtil.getFieldName(SystemFields.ISSUENO));
                if (itemIDValue != null) {
                    Integer itemID = Integer.valueOf(itemIDValue);
                    hitIDs[i] = itemID.intValue();
                    if (highlightedTextMap != null) {
                        String highligherFieldValue = doc.get(LuceneUtil.HIGHLIGHTER_FIELD);
                        TokenStream tokenStream = null;
                        try {
                            // No term vectors available here (null Fields): the text is re-analyzed.
                            tokenStream = TokenSources.getTokenStream(LuceneUtil.HIGHLIGHTER_FIELD, null,
                                    highligherFieldValue, LuceneUtil.getAnalyzer(), -1);
                        } catch (Exception ex) {
                            LOGGER.debug(ex.getMessage());
                        }
                        if (tokenStream != null) {
                            String fragment = highlighter.getBestFragment(tokenStream, highligherFieldValue);
                            if (fragment != null) {
                                highlightedTextMap.put(itemID, fragment);
                            }
                        }
                    }
                }
            }
        }
        return hitIDs;
    } catch (BooleanQuery.TooManyClauses e) {
        LOGGER.error("Searching the query resulted in too many clauses. Try to narrow the query results. "
                + e.getMessage());
        LOGGER.debug(ExceptionUtils.getStackTrace(e));
        throw e;
    } catch (Exception e) {
        LOGGER.error("Searching the workitems failed with " + e.getMessage());
        LOGGER.debug(ExceptionUtils.getStackTrace(e));
        return hitIDs;
    } finally {
        closeIndexSearcherAndUnderlyingIndexReader(indexSearcher, "workItem");
    }
}

From source file:com.difference.historybook.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Searches the given collection, boosting more recent documents, grouping hits by URL so each
 * page appears at most once, and attaching highlighted, HTML-sanitized snippets.
 *
 * @param collection the collection to restrict the search to
 * @param query the user query string, parsed with the index's query parser
 * @param offset index of the first group to return
 * @param size maximum number of results to return
 * @param includeDebug if true, per-hit score explanations and the final query string are included
 * @return the wrapped results with total group count and optional debug info
 * @throws IndexException on I/O, query-parse, or highlighting failures
 */
@Override
public SearchResultWrapper search(String collection, String query, int offset, int size, boolean includeDebug)
        throws IndexException {
    try {
        //TODO: make age be a component in the ranking?
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        queryBuilder.add(parser.parse(query), Occur.MUST);
        // FILTER restricts to the collection without contributing to the score
        queryBuilder.add(new TermQuery(new Term(IndexDocumentAdapter.FIELD_COLLECTION, collection)),
                Occur.FILTER);
        Query baseQuery = queryBuilder.build();

        // Boost factor decays with document age (reciprocal of seconds since the indexed timestamp).
        FunctionQuery boostQuery = new FunctionQuery(
                new ReciprocalFloatFunction(new DurationValueSource(new Date().getTime() / 1000,
                        new LongFieldSource(IndexDocumentAdapter.FIELD_TIMESTAMP)), RECIP, 1F, 1F));

        Query q = new CustomScoreQuery(baseQuery, boostQuery);

        QueryScorer queryScorer = new QueryScorer(q, IndexDocumentAdapter.FIELD_SEARCH);
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        Highlighter highlighter = new Highlighter(queryScorer);
        highlighter.setTextFragmenter(fragmenter);

        // One result per URL group; keep only the best-scoring document of each group.
        GroupingSearch gsearch = new GroupingSearch(IndexDocumentAdapter.FIELD_URL_GROUP).setGroupDocsLimit(1)
                .setAllGroups(true).setIncludeMaxScore(true);
        TopGroups<?> groups = gsearch.search(searcher, q, offset, size);

        ArrayList<SearchResult> results = new ArrayList<>(size);
        for (int i = offset; i < offset + size && i < groups.groups.length; i++) {
            ScoreDoc scoreDoc = groups.groups[i].scoreDocs[0];
            Document luceneDoc = searcher.doc(scoreDoc.doc);
            IndexDocumentAdapter doc = new IndexDocumentAdapter(luceneDoc);

            // Uses stored term vectors when present, otherwise re-analyzes the stored text.
            TokenStream tokenStream = TokenSources.getTokenStream(IndexDocumentAdapter.FIELD_SEARCH,
                    reader.getTermVectors(scoreDoc.doc), luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH),
                    analyzer, highlighter.getMaxDocCharsToAnalyze() - 1);

            String[] snippets = highlighter.getBestFragments(tokenStream,
                    luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH), 3);
            String snippet = String.join("\n", snippets);
            snippet = Jsoup.clean(snippet, Whitelist.simpleText());

            String debugInfo = null;
            if (includeDebug) {
                Explanation explanation = searcher.explain(q, scoreDoc.doc);
                debugInfo = explanation.toString();
            }

            results.add(new SearchResult(doc.getKey(), doc.getCollection(), doc.getTitle(), doc.getUrl(),
                    doc.getDomain(), doc.getTimestampText(), snippet, debugInfo, scoreDoc.score));
        }

        SearchResultWrapper wrapper = new SearchResultWrapper().setQuery(query).setOffset(offset)
                .setMaxResultsRequested(size)
                .setResultCount(groups.totalGroupCount != null ? groups.totalGroupCount : 0)
                .setResults(results);

        if (includeDebug) {
            wrapper.setDebugInfo(q.toString());
        }

        return wrapper;

    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
        LOG.error(e.getLocalizedMessage());
        throw new IndexException(e);
    }
}

From source file:com.rapidminer.search.GlobalSearchHandler.java

License:Open Source License

/**
 * Creates the search result for search methods.
 *
 * @param searchTerm/* w w  w .j  ava 2 s.c  om*/
 *       the search string
 * @param searcher
 *       the index searcher instance which was used to search
 * @param result
 *       the result of the search
 * @param highlightResult
 *       if {@code true}, the {@link GlobalSearchResult#getBestFragments()} will be created
 * @return the search result instance, never {@code null}
 * @throws IOException
 *       if something goes wrong
 */
private GlobalSearchResult createSearchResult(final String searchTerm, final Query parsedQuery,
        final IndexSearcher searcher, final TopDocs result, final boolean highlightResult) throws IOException {
    int resultNumber = result.scoreDocs.length;
    List<Document> resultList = new ArrayList<>(resultNumber);
    List<String[]> highlights = highlightResult ? new LinkedList<>() : null;
    ScoreDoc lastResult = resultNumber > 0 ? result.scoreDocs[result.scoreDocs.length - 1] : null;
    for (ScoreDoc scoreDoc : result.scoreDocs) {
        Document doc = searcher.doc(scoreDoc.doc);
        resultList.add(doc);

        if (highlightResult) {
            // search result highlighting best match on name field
            QueryScorer scorer = new QueryScorer(parsedQuery);
            Highlighter highlighter = new Highlighter(HIGHLIGHT_FORMATTER, scorer);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, FRAGMENT_SIZE);
            highlighter.setTextFragmenter(fragmenter);
            try {
                TokenStream stream = TokenSources.getTokenStream(GlobalSearchUtilities.FIELD_NAME,
                        searcher.getIndexReader().getTermVectors(scoreDoc.doc),
                        doc.get(GlobalSearchUtilities.FIELD_NAME), GlobalSearchUtilities.ANALYZER,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE - 1);
                if (stream != null) {
                    highlights.add(highlighter.getBestFragments(stream,
                            doc.get(GlobalSearchUtilities.FIELD_NAME), MAX_NUMBER_OF_FRAGMENTS));
                } else {
                    highlights.add(null);
                }
            } catch (InvalidTokenOffsetsException e) {
                highlights.add(null);
            }
        }
    }
    return new GlobalSearchResult(resultList, searchTerm, lastResult, result.totalHits, highlights);
}

From source file:com.tripod.lucene.service.AbstractLuceneService.java

License:Apache License

/**
 * Performs highlighting for a given query and a given document.
 *
 * @param indexSearcher the IndexSearcher performing the query
 * @param query the Tripod LuceneQuery/* w ww  . j  a  v  a2 s . c o  m*/
 * @param scoreDoc the Lucene ScoreDoc
 * @param doc the Lucene Document
 * @param highlighter the Highlighter to use
 * @param result the QueryResult to add the highlights to
 * @throws IOException if an error occurs performing the highlighting
 * @throws InvalidTokenOffsetsException if an error occurs performing the highlighting
 */
protected void performHighlighting(final IndexSearcher indexSearcher, final Q query, final ScoreDoc scoreDoc,
        final Document doc, final Highlighter highlighter, final QR result)
        throws IOException, InvalidTokenOffsetsException {

    if (query.getHighlightFields() == null || query.getHighlightFields().isEmpty()) {
        return;
    }

    final List<Highlight> highlights = new ArrayList<>();
    final List<String> hlFieldNames = getHighlightFieldNames(query, doc);

    // process each field to highlight on
    for (String hlField : hlFieldNames) {
        final String text = doc.get(hlField);
        if (StringUtils.isEmpty(text)) {
            continue;
        }

        final List<String> snippets = new ArrayList<>();
        final Fields tvFields = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc);
        final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;

        // get the snippets for the given field
        final TokenStream tokenStream = TokenSources.getTokenStream(hlField, tvFields, text, analyzer,
                maxStartOffset);
        final TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        for (TextFragment textFragment : textFragments) {
            if (textFragment != null && textFragment.getScore() > 0) {
                snippets.add(textFragment.toString());
            }
        }

        // if we have snippets then add a highlight result to the QueryResult
        if (snippets.size() > 0) {
            highlights.add(new Highlight(hlField, snippets));
        }
    }

    result.setHighlights(highlights);
}

From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java

License:Open Source License

/**
 * Executes the given query expression against the Lucene index and returns the matching
 * file paths, optionally enriched with per-token offset and line information for each hit.
 *
 * @param query the query expression (text, paging, and position options)
 * @return the search result with hits, total count, timing, and a next-page query when more
 *         results remain
 * @throws InvalidQueryException if the query text cannot be parsed
 * @throws QueryExecutionException if index access fails or a token offset exceeds the text
 */
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        // Pick up any pending index changes before acquiring a searcher.
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        // Paging: skip the requested number of documents via searchAfter's anchor doc.
        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        // Page size, capped at RESULT_LIMIT.
        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT)
                : RESULT_LIMIT;
        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();

                    // Rebuild a token stream from term vectors if stored, else re-analyze the text.
                    // NOTE(review): the stream is never close()d — confirm whether that leaks here.
                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);

                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    // init() may wrap the stream (e.g. for caching); use the wrapped one if provided.
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);

                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    // Walk tokens until exhausted or past the analyze limit.
                    for (boolean next = tokenStream.incrementToken(); next
                            && (offsetAtt.startOffset() < 1_000_000); next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        // Tokens with a positive score are query matches worth reporting.
                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            // Locate the line containing startOffset by accumulating line lengths.
                            // NOTE(review): len does not include the line-terminator characters, so
                            // lineNum/foundLine may drift for offsets later in the text — verify.
                            // NOTE(review): the Scanner is never closed (harmless on a String, but untidy).
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();

                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(
                                    new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }

            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;

        // NOTE(review): the "+ 1" looks like an off-by-one — with exactly totalHitsNum
        // retrieved this still reports no more results, but confirm the intent.
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult().withResults(results).withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression).withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}

From source file:tw.com.kyle.luminance.LumReader.java

/**
 * Returns a TokenStream for the given document field, built from the stored term vectors.
 *
 * @param docId the Lucene document id to read term vectors for
 * @param field the field whose tokens are requested
 * @return a token stream un-inverted from the term vector, or {@code null} when the document
 *         stores no term vectors or has no term vector for the given field
 * @throws IOException if reading the term vectors fails
 */
public TokenStream GetTokenStream(int docId, String field) throws IOException {
    Fields tvFields = reader.getTermVectors(docId);
    // getTermVectors() returns null when the document stores no term vectors at all;
    // the previous code dereferenced tvFields unconditionally and could NPE here.
    if (tvFields == null || tvFields.terms(field) == null) {
        return null;
    }
    // Empty text and maxStartOffset -1: the stream is built purely from the term vector.
    return TokenSources.getTokenStream(field, tvFields, "", new StandardAnalyzer(), -1);
}