/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.search.context;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.PhraseHelper;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.ExpandTabsReader;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;
import org.opengrok.indexer.util.IOUtils;
import org.opengrok.indexer.web.Util;

/**
 * Represents a subclass of {@link UnifiedHighlighter} with customizations for
 * OpenGrok.
 */
public class OGKUnifiedHighlighter extends UnifiedHighlighter {

    private static final Logger LOGGER = LoggerFactory.getLogger(
            OGKUnifiedHighlighter.class);

    private final RuntimeEnvironment env;

    private int tabSize;

    private String fileTypeName;

    /**
     * Initializes an instance with
     * {@link UnifiedHighlighter#UnifiedHighlighter(org.apache.lucene.search.IndexSearcher, org.apache.lucene.analysis.Analyzer)}
     * for the specified {@code indexSearcher} and {@code indexAnalyzer}, and
     * stores the {@code env} for later use.
     * @param env a required instance
     * @param indexSearcher a required instance
     * @param indexAnalyzer a required instance
     * @throws IllegalArgumentException if any argument is null
     */
    public OGKUnifiedHighlighter(RuntimeEnvironment env,
            IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
        super(indexSearcher, indexAnalyzer);

        if (env == null) {
            throw new IllegalArgumentException("env is null");
        }
        this.env = env;
    }

    /**
     * Gets a file type name-specific analyzer during the execution of
     * {@link #highlightFieldsUnion(java.lang.String[], org.apache.lucene.search.Query, int, int)},
     * or just gets the object passed in to the constructor at all other
     * times.
     * @return a defined instance
     */
    @Override
    public Analyzer getIndexAnalyzer() {
        String ftname = fileTypeName;
        if (ftname == null) {
            return indexAnalyzer;
        }
        Analyzer fa = AnalyzerGuru.getAnalyzer(ftname);
        return fa == null ? indexAnalyzer : fa;
    }

    public int getTabSize() {
        return tabSize;
    }

    public void setTabSize(int value) {
        this.tabSize = value;
    }
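
    /*
     * A minimal usage sketch, not taken from the OpenGrok sources: the
     * searcher, analyzer, query, and docId below are assumed to come from
     * the caller (for example, from an open searcher and the index-time
     * analyzer), and the RuntimeEnvironment singleton accessor is the usual
     * OpenGrok pattern.
     *
     *   IndexSearcher searcher = ...;  // an open searcher (assumption)
     *   Analyzer analyzer = ...;       // the index-time analyzer (assumption)
     *   OGKUnifiedHighlighter uhi = new OGKUnifiedHighlighter(
     *           RuntimeEnvironment.getInstance(), searcher, analyzer);
     *   uhi.setTabSize(8);             // match the project's tab size
     *   String html = uhi.highlightFieldsUnion(
     *           new String[] {QueryBuilder.FULL}, query, docId, 10);
     */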
    /**
     * Transiently arranges that {@link #getIndexAnalyzer()} returns a file
     * type name-specific analyzer during a subsequent call of
     * {@link #highlightFieldsUnionWork(java.lang.String[], org.apache.lucene.search.Query, int, int)}.
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    public String highlightFieldsUnion(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        /*
         * Setting fileTypeName has to happen before getFieldHighlighter() is
         * called by highlightFieldsAsObjects() so that the result of
         * getIndexAnalyzer() (if it is called due to requiring ANALYSIS) can
         * be influenced by fileTypeName.
         */
        Document doc = searcher.doc(docId);
        fileTypeName = doc == null ? null : doc.get(QueryBuilder.TYPE);
        try {
            return highlightFieldsUnionWork(fields, query, docId, lineLimit);
        } finally {
            fileTypeName = null;
        }
    }

    /**
     * Calls
     * {@link #highlightFieldsAsObjects(java.lang.String[], org.apache.lucene.search.Query, int[], int[])},
     * and merges multiple passages if the formatter returns
     * {@link FormattedLines} or else returns the first formatted result.
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    protected String highlightFieldsUnionWork(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        int[] maxPassagesCopy = new int[fields.length];
        /*
         * N.b. lineLimit + 1 so that the ContextFormatter has an indication
         * when to display the "more..." link.
         */
        Arrays.fill(maxPassagesCopy, lineLimit + 1);

        FormattedLines res = null;
        Map<String, Object[]> mappedRes = highlightFieldsAsObjects(fields,
                query, new int[] {docId}, maxPassagesCopy);
        for (Object[] flinesz : mappedRes.values()) {
            for (Object obj : flinesz) {
                /*
                 * Empirical testing showed that the passage could be null if
                 * the original source text is not available to the
                 * highlighter.
                 */
                if (obj != null) {
                    if (!(obj instanceof FormattedLines)) {
                        return obj.toString();
                    }
                    FormattedLines flines = (FormattedLines) obj;
                    res = res == null ? flines : res.merge(flines);
                }
            }
        }
        if (res == null) {
            return null;
        }
        if (res.getCount() > lineLimit) {
            res.setLimited(true);
            while (res.getCount() > lineLimit) {
                res.pop();
            }
        }
        return res.toString();
    }
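
    /*
     * A worked example of the trimming above (illustrative values only):
     * with lineLimit = 10, maxPassagesCopy is filled with 11, so up to 11
     * formatted lines can come back from highlightFieldsAsObjects(). If all
     * 11 arrive, getCount() exceeds lineLimit, so the result is marked
     * limited -- letting the formatter render the "more..." link -- and
     * pop() runs once to bring the rendered count back down to 10.
     */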
    /**
     * Produces original text by reading from OpenGrok source content relative
     * to {@link RuntimeEnvironment#getSourceRootPath()} and returns the
     * content for each document if the timestamp matches -- or else just
     * {@code null} for a missing file or a timestamp mismatch (as "the
     * returned Strings must be identical to what was indexed.")
     * <p>
     * "This method must load fields for at least one document from the given
     * {@link DocIdSetIterator} but need not return all of them; by default
     * the character lengths are summed and this method will return early when
     * {@code cacheCharsThreshold} is exceeded. Specifically if that number is
     * 0, then only one document is fetched no matter what. Values in the
     * array of {@link CharSequence} will be {@code null} if no value was
     * found."
     * @return a defined instance
     * @throws IOException if an I/O error occurs
     */
    @Override
    protected List<CharSequence[]> loadFieldValues(String[] fields,
            DocIdSetIterator docIter, int cacheCharsThreshold)
            throws IOException {
        List<CharSequence[]> docListOfFields = new ArrayList<>(
                cacheCharsThreshold == 0 ? 1 : (int) Math.min(64,
                        docIter.cost()));

        int sumChars = 0;
        do {
            int docId = docIter.nextDoc();
            if (docId == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
            Document doc = searcher.doc(docId);
            String path = doc.get(QueryBuilder.PATH);
            String storedU = doc.get(QueryBuilder.U);
            String content = getRepoFileContent(path, storedU);

            CharSequence[] seqs = new CharSequence[fields.length];
            Arrays.fill(seqs, content);
            docListOfFields.add(seqs);

            if (content != null) {
                sumChars += content.length();
            }
        } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);

        return docListOfFields;
    }

    /**
     * Returns the value from the {@code super} implementation, with logging
     * for ANALYSIS of any field but {@link QueryBuilder#FULL} or
     * {@link QueryBuilder#REFS}.
     * @return the value from the {@code super} implementation
     */
    @Override
    protected OffsetSource getOptimizedOffsetSource(String field,
            BytesRef[] terms, PhraseHelper phraseHelper,
            CharacterRunAutomaton[] automata) {

        OffsetSource res = super.getOptimizedOffsetSource(field, terms,
                phraseHelper, automata);
        if (res == OffsetSource.ANALYSIS) {
            /*
             * Testing showed that UnifiedHighlighter falls back to ANALYSIS
             * in the presence of multi-term queries (MTQs) such as prefixes
             * and wildcards even for fields that are analyzed with POSTINGS
             * -- i.e. with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. This is
             * despite UnifiedHighlighter seeming to indicate that postings
             * should be sufficient in the comment for
             * shouldHandleMultiTermQuery(String): "MTQ highlighting can be
             * expensive, particularly when using offsets in postings." DEFS
             * are stored with term vectors to avoid this problem, since
             * re-analysis would not reproduce the ctags Definitions. For
             * FULL and REFS, highlightFieldsUnion() arranges that
             * getIndexAnalyzer() can return a TYPE-specific analyzer for use
             * by getOffsetStrategy() -- if re-ANALYSIS is required.
             */
            switch (field) {
                case QueryBuilder.FULL:
                case QueryBuilder.REFS:
                    // Acceptable -- as described above.
                    break;
                default:
                    if (LOGGER.isLoggable(Level.FINE)) {
                        OffsetSource defaultRes = getOffsetSource(field);
                        LOGGER.log(Level.FINE, "Field {0} using {1} vs {2}",
                                new Object[] {field, res, defaultRes});
                    }
                    break;
            }
        }
        return res;
    }
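
    /*
     * To make the fallback above concrete (an illustration, not an
     * exhaustive account): a wildcard term such as "print*" against FULL or
     * REFS can drive the resolved OffsetSource to ANALYSIS, which is
     * acceptable because highlightFieldsUnion() has arranged a TYPE-specific
     * analyzer for the re-analysis. The same fallback on any other field --
     * e.g. DEFS, whose tokens come from ctags and cannot be reproduced by
     * re-analysis -- is unexpected, hence the FINE logging.
     */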
    private String getRepoFileContent(String repoRelPath, String storedU)
            throws IOException {

        if (storedU == null) {
            LOGGER.log(Level.FINE, "Missing U[UID] for: {0}", repoRelPath);
            return null;
        }

        String repoAbsPath = env.getSourceRootPath() + repoRelPath;
        File repoAbsFile = new File(repoAbsPath);
        if (!repoAbsFile.exists()) {
            LOGGER.log(Level.FINE, "Missing file: {0}", repoAbsPath);
            return null;
        }

        repoRelPath = Util.fixPathIfWindows(repoRelPath);
        // Verify that the timestamp (U) is unchanged by comparing UIDs.
        String uid = Util.path2uid(repoRelPath,
                DateTools.timeToString(repoAbsFile.lastModified(),
                        DateTools.Resolution.MILLISECOND));
        BytesRef buid = new BytesRef(uid);
        BytesRef storedBuid = new BytesRef(storedU);
        if (storedBuid.compareTo(buid) != 0) {
            LOGGER.log(Level.FINE, "Last-modified differs for: {0}",
                    repoRelPath);
            return null;
        }

        StringBuilder bld = new StringBuilder();
        StreamSource src = StreamSource.fromFile(repoAbsFile);
        try (InputStream in = src.getStream();
                Reader rdr = getReader(in)) {
            int c;
            while ((c = rdr.read()) != -1) {
                bld.append((char) c);
            }
        }

        return bld.toString();
    }

    private Reader getReader(InputStream in) throws IOException {
        Reader bsrdr = IOUtils.createBOMStrippedReader(in,
                StandardCharsets.UTF_8.name());
        BufferedReader bufrdr = new BufferedReader(bsrdr);
        return ExpandTabsReader.wrap(bufrdr, tabSize);
    }
}
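
/*
 * For reference, a sketch of the UID check performed in getRepoFileContent()
 * (the path and timestamp below are hypothetical): for a file "/project/main.c"
 * last modified at epoch-millis 1514764800000, Util.path2uid(repoRelPath,
 * DateTools.timeToString(...)) composes the same key the indexer stored in
 * QueryBuilder.U, so the byte-wise BytesRef comparison detects any
 * modification since indexing and makes the highlighter skip stale content
 * rather than return text that differs from what was indexed.
 */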