org.apache.solr.highlight.LuceneRegexFragmenter.java Source code

Introduction

Here is the source code for org.apache.solr.highlight.LuceneRegexFragmenter.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.highlight;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.highlight.Fragmenter;

/**
 * Fragmenter that tries to produce snippets that "look" like a regular 
 * expression.
 *
 * NOTE: the default for <code>maxAnalyzedChars</code> is much lower for this 
 * fragmenter.  After this limit is exhausted, fragments are produced in the
 * same way as <code>GapFragmenter</code>
 */
class LuceneRegexFragmenter implements Fragmenter {
    // ** defaults

    /** Default desired fragment length, in characters. */
    public static final int DEFAULT_FRAGMENT_SIZE = 70;
    /** Default position-increment gap above which a new fragment is always started. */
    public static final int DEFAULT_INCREMENT_GAP = 50;
    /** Default factor by which the fragment size may be bent (larger or smaller). */
    public static final float DEFAULT_SLOP = 0.6f;
    /** Default analysis limit used while collecting hotspots. */
    public static final int DEFAULT_MAX_ANALYZED_CHARS = 10000;

    // note: could dynamically change size of sentences extracted to match
    // target frag size
    /** Source of {@link #DEFAULT_PATTERN}. */
    public static final String DEFAULT_PATTERN_RAW = "[-\\w ,\\n\"']{20,200}";
    /** Pattern used to locate hotspots when no pattern is supplied to the constructor. */
    public static final Pattern DEFAULT_PATTERN = Pattern.compile(DEFAULT_PATTERN_RAW);

    // ** settings

    // desired length of fragments, in characters
    protected int targetFragChars;
    // increment gap which indicates a new fragment should occur
    // (often due to multi-valued fields)
    protected int incrementGapThreshold;
    // factor by which we are allowed to bend the frag size (larger or smaller)
    protected float slop;
    // analysis limit (ensures we don't waste too much time on long fields)
    protected int maxAnalyzedChars;
    // default desirable pattern for text fragments.
    protected Pattern textRE;

    // ** state (reset by start(), updated by isNewFragment())

    // starts at 1 and is incremented every time a new fragment is announced
    protected int currentNumFrags;
    // end offset of the token that opened the current fragment
    protected int currentOffset;
    // offset after which the current fragment should end; <= 0 means "not chosen yet"
    protected int targetOffset;
    // sorted start/end offsets of every regex match found by addHotSpots()
    protected int[] hotspots;

    private PositionIncrementAttribute posIncAtt;
    private OffsetAttribute offsetAtt;

    /** Creates a fragmenter that uses every default setting and {@link #DEFAULT_PATTERN}. */
    public LuceneRegexFragmenter() {
        this(DEFAULT_FRAGMENT_SIZE, DEFAULT_INCREMENT_GAP, DEFAULT_SLOP, DEFAULT_MAX_ANALYZED_CHARS);
    }

    /**
     * Creates a fragmenter with a custom fragment size and defaults for everything else.
     *
     * @param targetFragChars desired length of fragments, in characters
     */
    public LuceneRegexFragmenter(int targetFragChars) {
        this(targetFragChars, DEFAULT_INCREMENT_GAP, DEFAULT_SLOP, DEFAULT_MAX_ANALYZED_CHARS);
    }

    /**
     * Creates a fragmenter with custom numeric settings and {@link #DEFAULT_PATTERN}.
     *
     * @param targetFragChars desired length of fragments, in characters
     * @param incrementGapThreshold position increment above which a new fragment always starts
     * @param slop factor by which the fragment size may be bent (larger or smaller)
     * @param maxAnalyzedChars analysis limit used while collecting hotspots
     */
    public LuceneRegexFragmenter(int targetFragChars, int incrementGapThreshold, float slop, int maxAnalyzedChars) {
        this(targetFragChars, incrementGapThreshold, slop, maxAnalyzedChars, DEFAULT_PATTERN);
    }

    /**
     * Creates a fully customised fragmenter.
     *
     * @param targetFragChars desired length of fragments, in characters
     * @param incrementGapThreshold position increment above which a new fragment always starts
     * @param slop factor by which the fragment size may be bent (larger or smaller)
     * @param maxAnalyzedChars analysis limit used while collecting hotspots
     * @param targetPattern pattern whose match boundaries become candidate fragment boundaries
     */
    public LuceneRegexFragmenter(int targetFragChars, int incrementGapThreshold, float slop, int maxAnalyzedChars,
            Pattern targetPattern) {
        this.targetFragChars = targetFragChars;
        this.incrementGapThreshold = incrementGapThreshold;
        this.slop = slop;
        this.maxAnalyzedChars = maxAnalyzedChars;
        this.textRE = targetPattern;
    }

    /**
     * Resets the per-text state, collects the hotspots of {@code originalText} and looks up
     * the position-increment and offset attributes of {@code tokenStream}.
     *
     * @param originalText the text that is about to be fragmented
     * @param tokenStream the stream whose attributes are consulted by {@link #isNewFragment()}
     */
    @Override
    public void start(String originalText, TokenStream tokenStream) {
        currentNumFrags = 1;
        currentOffset = 0;
        // A target chosen while fragmenting a previous text must not leak into this one.
        targetOffset = -1;
        addHotSpots(originalText);
        posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
        offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
    }

    ////////////////////////////////////
    // pre-analysis
    ////////////////////////////////////

    /**
     * Records the start and end offset of every match of the configured pattern in
     * {@code text} into {@link #hotspots}, in ascending order. Matching stops once a match
     * ends at or beyond {@code maxAnalyzedChars}.
     *
     * @param text the text to scan for hotspots
     */
    protected void addHotSpots(String text) {
        // Presize for roughly one entry per target-sized fragment. A non-positive target
        // size would otherwise divide by zero or request a negative capacity.
        int expectedSize = targetFragChars > 0 ? text.length() / targetFragChars : 0;
        ArrayList<Integer> offsets = new ArrayList<>(expectedSize);

        Matcher match = textRE.matcher(text);
        int cur = 0;
        // Test the limit first so that no regex search is run once it is exhausted.
        while (cur < maxAnalyzedChars && match.find()) {
            int start = match.start(), end = match.end();
            offsets.add(start);
            offsets.add(end);
            cur = end;
        }

        int[] collected = new int[offsets.size()];
        for (int i = 0; i < collected.length; i++) {
            collected[i] = offsets.get(i);
        }
        // Successive find() calls already yield ascending offsets; the sort is kept as a
        // cheap guarantee of the ordering that binarySearch relies on.
        Arrays.sort(collected);
        hotspots = collected;
    }

    ////////////////////////////////////
    // fragmenting
    ////////////////////////////////////

    /**
     * Decides whether the current token starts a new fragment.
     *
     * <p>A position increment above {@code incrementGapThreshold} always starts a new
     * fragment. Otherwise no fragment is started until the current one has reached its
     * minimum length; after that a target end offset is chosen once (preferring a nearby
     * hotspot) and a new fragment starts with the first token that ends beyond it.
     *
     * @return {@code true} if the current token begins a new fragment
     */
    @Override
    public boolean isNewFragment() {
        int minFragLen = (int) ((1.0f - slop) * targetFragChars);
        int endOffset = offsetAtt.endOffset();
        boolean isNewFrag;

        // ** determine isNewFrag
        if (posIncAtt.getPositionIncrement() > incrementGapThreshold) {
            // large position gaps always imply new fragments
            isNewFrag = true;

        } else if (endOffset - currentOffset < minFragLen) {
            // we're not in our range of flexibility
            isNewFrag = false;

        } else {
            if (targetOffset <= 0) {
                // no target decided yet for this fragment: pick one now
                targetOffset = chooseTargetOffset(endOffset, minFragLen);
            }
            isNewFrag = endOffset > targetOffset;
        }

        // ** operate on isNewFrag
        if (isNewFrag) {
            currentNumFrags++;
            currentOffset = endOffset;
            targetOffset = -1;
        }
        return isNewFrag;
    }

    /**
     * Picks the offset at which the current fragment should end: the first hotspot at or
     * after {@code endOffset} that lies within the slop window around the target size, or
     * {@code currentOffset + targetFragChars} when there is no such hotspot.
     */
    private int chooseTargetOffset(int endOffset, int minFragLen) {
        int fallback = currentOffset + targetFragChars;
        int minOffset = currentOffset + minFragLen;
        int maxOffset = (int) (currentOffset + (1.0f + slop) * targetFragChars);

        // look for a close hotspot
        int hotIndex = Arrays.binarySearch(hotspots, endOffset);
        if (hotIndex < 0) {
            // On a miss binarySearch returns -(insertionPoint) - 1; recover the insertion
            // point, i.e. the index of the first hotspot greater than endOffset.
            hotIndex = -hotIndex - 1;
        }

        // skip hotspots that would make the fragment too short, without running off the end
        while (hotIndex < hotspots.length && hotspots[hotIndex] < minOffset) {
            hotIndex++;
        }

        if (hotIndex >= hotspots.length) {
            // no more hotspots in this input stream
            return fallback;
        }
        int goal = hotspots[hotIndex];
        // a hotspot beyond the slop window is of no use
        return goal <= maxOffset ? goal : fallback;
    }

}