// Java tutorial
/** * Copyright 2009 Welocalize, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.globalsight.ling.lucene.highlight; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.PriorityQueue; import com.globalsight.ling.tm2.lucene.LuceneUtil; /** * Class used to markup highlighted terms found in the best sections * of a text, using configurable {@link Fragmenter}, {@link Scorer}, * {@link Formatter} and tokenizers. * @author mark@searcharea.co.uk */ public class Highlighter { static public final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024; private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; private Formatter formatter; private Fragmenter textFragmenter = new SimpleFragmenter(); private Scorer fragmentScorer = null; public Highlighter(Scorer fragmentScorer) { this(new SimpleHTMLFormatter(), fragmentScorer); } public Highlighter(Formatter formatter, Scorer fragmentScorer) { this.formatter = formatter; this.fragmentScorer = fragmentScorer; } /** * Highlights chosen terms in a text, extracting the most relevant * section. The document text is analysed in chunks to record hit * statistics across the document. After accumulating stats, the * fragment with the highest score is returned. * * @param tokenStream a stream of tokens identified in the text * parameter, including offset information. 
* * This is typically produced by an analyzer re-parsing a document's * text. Some work may be done on retrieving TokenStreams more efficently * by adding support for storing original text position data in the Lucene * index but this support is not currently available (as of Lucene 1.4 rc2). * * @param text text to highlight terms in * * @return highlighted text fragment or null if no terms found */ public final String getBestFragment(TokenStream tokenStream, String text) throws IOException { String[] results = getBestFragments(tokenStream, text, 1); if (results.length > 0) { return results[0]; } return null; } /** * Highlights chosen terms in a text, extracting the most relevant sections. * * The document text is analysed in chunks to record hit * statistics across the document. After accumulating stats, the * fragments with the highest scores are returned as an array of * strings in order of score (contiguous fragments are merged into * one in their original order to improve readability) * * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * * @return highlighted text fragments (between 0 and * maxNumFragments number of fragments) */ public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments) throws IOException { maxNumFragments = Math.max(1, maxNumFragments); //sanity check TextFragment[] frag = getBestTextFragments(tokenStream, text, true, maxNumFragments); //Get text ArrayList fragTexts = new ArrayList(); int n = 0; for (int i = 0; i < frag.length; i++) { if ((frag[i] != null) && (frag[i].getScore() > 0)) { fragTexts.add(frag[i].toString()); } } return (String[]) fragTexts.toArray(new String[0]); } /** * Highlights terms in the text, extracting the most relevant * sections and concatenating the chosen fragments with a * separator (typically "..."). * * The document text is analysed in chunks to record hit * statistics across the document. 
After accumulating stats, the * fragments with the highest scores are returned in order as * "separator" delimited strings. * * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * @param separator the separator used to intersperse the document * fragments (typically "...") * * @return highlighted text */ public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments, String separator) throws IOException { String sections[] = getBestFragments(tokenStream, text, maxNumFragments); StringBuffer result = new StringBuffer(); for (int i = 0; i < sections.length; i++) { if (i > 0) { result.append(separator); } result.append(sections[i]); } return result.toString(); } /** * Low level api to get the most relevant (formatted) sections of * the document. * * This method has been made public to allow visibility of score * information held in TextFragment objects. Thanks to Jason * Calabrese for help in redefining the interface. 
* @param tokenStream * @param text * @param maxNumFragments * @param mergeContiguousFragments * @return * @throws IOException */ public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments) throws IOException { ArrayList docFrags = new ArrayList(); StringBuffer newText = new StringBuffer(); TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size()); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); try { org.apache.lucene.analysis.Token token; String tokenText; int startOffset; int endOffset; int lastEndOffset = 0; textFragmenter.start(text); TokenGroup tokenGroup = new TokenGroup(); while ((token = LuceneUtil.getNextToken(tokenStream)) != null) { if (tokenGroup.numTokens > 0 && tokenGroup.isDistinct(token)) { // the current token is distinct from previous tokens - // markup the cached token group info startOffset = tokenGroup.startOffset; endOffset = tokenGroup.endOffset; tokenText = text.substring(startOffset, endOffset); String markedUpText = formatter.highlightTerm(tokenText, tokenGroup); // store any whitespace etc from between this and last group if (startOffset > lastEndOffset) newText.append(text.substring(lastEndOffset, startOffset)); newText.append(markedUpText); lastEndOffset = endOffset; tokenGroup.clear(); // check if current token marks the start of a new fragment if (textFragmenter.isNewFragment(token)) { currentFrag.setScore(fragmentScorer.getFragmentScore()); //record stats for a new fragment currentFrag.textEndPos = newText.length(); currentFrag = new TextFragment(newText, newText.length(), docFrags.size()); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); } } tokenGroup.addToken(token, fragmentScorer.getTokenScore(token)); if (lastEndOffset > maxDocBytesToAnalyze) { break; } } currentFrag.setScore(fragmentScorer.getFragmentScore()); if 
(tokenGroup.numTokens > 0) { // flush the accumulated text (same code as in above loop) startOffset = tokenGroup.startOffset; endOffset = tokenGroup.endOffset; tokenText = text.substring(startOffset, endOffset); String markedUpText = formatter.highlightTerm(tokenText, tokenGroup); // store any whitespace etc from between this and last group if (startOffset > lastEndOffset) { newText.append(text.substring(lastEndOffset, startOffset)); } newText.append(markedUpText); lastEndOffset = endOffset; } // append text after end of last token if (lastEndOffset < text.length()) { newText.append(text.substring(lastEndOffset)); } currentFrag.textEndPos = newText.length(); // sort the most relevant sections of the text for (int i = 0, max = docFrags.size(); i < max; i++) { currentFrag = (TextFragment) docFrags.get(i); fragQueue.insertWithOverflow(currentFrag); } // return the most relevant fragments TextFragment result[] = new TextFragment[fragQueue.size()]; for (int i = result.length - 1; i >= 0; i--) { result[i] = (TextFragment) fragQueue.pop(); } // merge any contiguous fragments to improve readability if (mergeContiguousFragments) { mergeContiguousFragments(result); ArrayList fragTexts = new ArrayList(); for (int i = 0; i < result.length; i++) { if (result[i] != null && result[i].getScore() > 0) { fragTexts.add(result[i]); } } result = (TextFragment[]) fragTexts.toArray(new TextFragment[0]); } return result; } finally { if (tokenStream != null) { try { tokenStream.close(); } catch (Exception e) { } } } } /** Improves readability of a score-sorted list of TextFragments * by merging any fragments that were contiguous in the original * text into one larger fragment with the correct order. This * will leave a "null" in the array entry for the lesser scored * fragment. 
* * @param frag An array of document fragments in descending score */ private void mergeContiguousFragments(TextFragment[] frag) { boolean mergingStillBeingDone; if (frag.length > 1) do { mergingStillBeingDone = false; //initialise loop control flag //for each fragment, scan other frags looking for contiguous blocks for (int i = 0; i < frag.length; i++) { if (frag[i] == null) { continue; } //merge any contiguous blocks for (int x = 0; x < frag.length; x++) { if (frag[x] == null) { continue; } if (frag[i] == null) { break; } TextFragment frag1 = null; TextFragment frag2 = null; int frag1Num = 0; int frag2Num = 0; int bestScoringFragNum; int worstScoringFragNum; //if blocks are contiguous.... if (frag[i].follows(frag[x])) { frag1 = frag[x]; frag1Num = x; frag2 = frag[i]; frag2Num = i; } else if (frag[x].follows(frag[i])) { frag1 = frag[i]; frag1Num = i; frag2 = frag[x]; frag2Num = x; } //merging required.. if (frag1 != null) { if (frag1.getScore() > frag2.getScore()) { bestScoringFragNum = frag1Num; worstScoringFragNum = frag2Num; } else { bestScoringFragNum = frag2Num; worstScoringFragNum = frag1Num; } frag1.merge(frag2); frag[worstScoringFragNum] = null; mergingStillBeingDone = true; frag[bestScoringFragNum] = frag1; } } } } while (mergingStillBeingDone); } /** * @return the maximum number of bytes to be tokenized per doc */ public int getMaxDocBytesToAnalyze() { return maxDocBytesToAnalyze; } /** * @param byteCount the maximum number of bytes to be tokenized per doc * (This can improve performance with large documents) */ public void setMaxDocBytesToAnalyze(int byteCount) { maxDocBytesToAnalyze = byteCount; } public Fragmenter getTextFragmenter() { return textFragmenter; } public void setTextFragmenter(Fragmenter fragmenter) { textFragmenter = fragmenter; } public Scorer getFragmentScorer() { return fragmentScorer; } public void setFragmentScorer(Scorer scorer) { fragmentScorer = scorer; } } class FragmentQueue<T> extends PriorityQueue<T> { public 
FragmentQueue(int size) { super(size); } public final boolean lessThan(Object a, Object b) { TextFragment fragA = (TextFragment) a; TextFragment fragB = (TextFragment) b; if (fragA.getScore() == fragB.getScore()) return fragA.fragNum > fragB.fragNum; else return fragA.getScore() < fragB.getScore(); } }