net.sourceforge.docfetcher.model.search.HighlightService.java Source code

Introduction

Here is the source code for net.sourceforge.docfetcher.model.search.HighlightService.java
Source

/*******************************************************************************
 * Copyright (c) 2011 Tran Nam Quang.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Tran Nam Quang - initial API and implementation
 *******************************************************************************/

package net.sourceforge.docfetcher.model.search;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import net.sourceforge.docfetcher.model.Fields;
import net.sourceforge.docfetcher.model.IndexRegistry;
import net.sourceforge.docfetcher.model.index.IndexWriterAdapter;
import net.sourceforge.docfetcher.util.CheckedOutOfMemoryError;
import net.sourceforge.docfetcher.util.Util;
import net.sourceforge.docfetcher.util.annotations.MutableCopy;
import net.sourceforge.docfetcher.util.annotations.NotNull;
import net.sourceforge.docfetcher.util.annotations.VisibleForPackageGroup;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import com.google.common.io.Closeables;

/**
 * @author Tran Nam Quang
 */
@VisibleForPackageGroup
public final class HighlightService {

    private HighlightService() {
    }

    // The given text is trimmed
    @NotNull
    public static HighlightedString highlight(@NotNull Query query, boolean isPhraseQuery, @NotNull String text)
            throws CheckedOutOfMemoryError {
        text = trimDocument(text);
        List<Range> ranges;
        if (isPhraseQuery)
            ranges = highlightPhrases(query, text);
        else
            ranges = highlight(query, text);
        return new HighlightedString(text, ranges);
    }

    /**
     * Trims the given string as follows:
     * <ul>
     * <li>All trailing whitespace is removed.</li>
     * <li>All preceding empty lines are removed. This means that any leading
     * whitespace in the first non-empty line is preserved.</li>
     * </ul>
     */
    @NotNull
    private static String trimDocument(@NotNull String input) {
        input = Util.trimRight(input);
        Integer lineStart = 0;
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            if (c == '\r' || c == '\n') {
                lineStart = i + 1;
            } else if (!Character.isWhitespace(c)) {
                if (lineStart != null)
                    return input.substring(lineStart);
                return input.substring(i);
            }
        }
        return "";
    }

    @MutableCopy
    @NotNull
    @SuppressWarnings("unchecked")
    private static List<Range> highlightPhrases(@NotNull Query query, @NotNull String text)
            throws CheckedOutOfMemoryError {
        // FastVectorHighlighter only supports TermQuery, PhraseQuery and BooleanQuery
        FastVectorHighlighter highlighter = new FastVectorHighlighter(true, true, null, null);
        FieldQuery fieldQuery = highlighter.getFieldQuery(query);
        Directory directory = new RAMDirectory();
        try {
            /*
             * Hack: We have to put the given text in a RAM index, because the
             * fast-vector highlighter can only work on index readers
             */
            IndexWriterAdapter writer = new IndexWriterAdapter(directory);
            Document doc = new Document();
            doc.add(Fields.createContent(text, true)); // must store token positions and offsets
            writer.add(doc);
            Closeables.closeQuietly(writer); // flush unwritten documents into index
            IndexReader indexReader = IndexReader.open(directory);

            // This might throw an OutOfMemoryError
            FieldTermStack fieldTermStack = new FieldTermStack(indexReader, 0, Fields.CONTENT.key(), fieldQuery);

            FieldPhraseList fieldPhraseList = new FieldPhraseList(fieldTermStack, fieldQuery);

            // Hack: We'll use reflection to access a private field
            java.lang.reflect.Field field = fieldPhraseList.getClass().getDeclaredField("phraseList");
            field.setAccessible(true);
            LinkedList<WeightedPhraseInfo> infoList = (LinkedList<WeightedPhraseInfo>) field.get(fieldPhraseList);

            List<Range> ranges = new ArrayList<Range>(infoList.size());
            for (WeightedPhraseInfo phraseInfo : infoList) {
                int start = phraseInfo.getStartOffset();
                int end = phraseInfo.getEndOffset();
                ranges.add(new Range(start, end - start));
            }
            return ranges;
        } catch (OutOfMemoryError e) {
            throw new CheckedOutOfMemoryError(e);
        } catch (Exception e) {
            return new ArrayList<Range>(0);
        }
    }

    @MutableCopy
    @NotNull
    private static List<Range> highlight(@NotNull Query query, @NotNull String text)
            throws CheckedOutOfMemoryError {
        final List<Range> ranges = new ArrayList<Range>();
        /*
         * A formatter is supposed to return formatted text, but since we're
         * only interested in the start and end offsets of the search terms, we
         * return null and store the offsets in a list.
         */
        Formatter nullFormatter = new Formatter() {
            public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                    Token token = tokenGroup.getToken(i);
                    if (tokenGroup.getScore(i) == 0)
                        continue;
                    int start = token.startOffset();
                    int end = token.endOffset();
                    ranges.add(new Range(start, end - start));
                }
                return null;
            }
        };
        String key = Fields.CONTENT.key();
        Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key));
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        highlighter.setTextFragmenter(new NullFragmenter());
        try {
            /*
             * This has a return value, but we ignore it since we only want the
             * offsets. Might throw an OutOfMemoryError.
             */
            highlighter.getBestFragment(IndexRegistry.getAnalyzer(), key, text);
        } catch (OutOfMemoryError e) {
            throw new CheckedOutOfMemoryError(e);
        } catch (Exception e) {
            Util.printErr(e);
        }
        return ranges;
    }

}