Java tutorial: BlackLab's SearcherImpl
/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.search;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.AbstractSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Bits;

import nl.inl.blacklab.analysis.BLDutchAnalyzer;
import nl.inl.blacklab.externalstorage.ContentStore;
import nl.inl.blacklab.forwardindex.ForwardIndex;
import nl.inl.blacklab.index.complex.ComplexFieldUtil;
import nl.inl.blacklab.search.indexstructure.ComplexFieldDesc;
import nl.inl.blacklab.search.indexstructure.IndexStructure;
import nl.inl.blacklab.search.indexstructure.MetadataFieldDesc;
import nl.inl.blacklab.search.indexstructure.PropertyDesc;
import nl.inl.util.ExUtil;
import nl.inl.util.LogUtil;
import nl.inl.util.LuceneUtil;
import nl.inl.util.VersionFile;
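// NOTE: the Lucene API used below (SlowCompositeReaderWrapper, PostingsEnum,
// Weight.scorer(LeafReaderContext, Bits), IndexSearcher.createNormalizedWeight)
// appears to target Lucene 5.x; several of these calls were changed or removed
// in later Lucene releases.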
/**
 * The main interface into the BlackLab library. The Searcher object is
 * instantiated with an open Lucene IndexReader and accesses that index through
 * special methods.
 *
 * The Searcher object knows how to access the original contents of indexed
 * fields, either because the field is a stored field in the Lucene index, or
 * because it knows where else the content can be found (such as in
 * fixed-length-encoding files, for fast random access).
 *
 * Searcher is thread-safe: a single instance may be shared to perform a number
 * of simultaneous searches.
 */
public class SearcherImpl extends Searcher implements Closeable {

    protected static final Logger logger = Logger.getLogger(SearcherImpl.class);

    /** The Lucene index reader */
    IndexReader reader;

    /** The Lucene IndexSearcher, for dealing with non-Span queries (for per-document scoring) */
    private IndexSearcher indexSearcher;

    /** Directory where our index resides */
    private File indexLocation;

    /** If true, we've just created a new index. New indices cannot be searched, only added to. */
    private boolean isEmptyIndex = false;

    /** The index writer. Only valid in indexMode. */
    private IndexWriter indexWriter = null;

    /** Thread that automatically warms up the forward indices, if enabled. */
    private Thread warmUpForwardIndicesThread;

    /**
     * Open an index.
     *
     * @param indexDir the index directory
     * @param indexMode if true, open in index mode; if false, open in search mode.
     * @param createNewIndex if true, delete existing index in this location if it exists.
     * @param indexTemplateFile JSON file to use as template for index structure / metadata
     *            (if creating new index)
     * @throws IOException
     */
    SearcherImpl(File indexDir, boolean indexMode, boolean createNewIndex, File indexTemplateFile)
            throws IOException {
        this.indexMode = indexMode;

        if (!indexMode && createNewIndex)
            throw new RuntimeException("Cannot create new index, not in index mode");

        if (!createNewIndex) {
            if (!indexMode || VersionFile.exists(indexDir)) {
                if (!isIndex(indexDir)) {
                    throw new IllegalArgumentException("Not a BlackLab index, or wrong version! "
                            + VersionFile.report(indexDir));
                }
            }
        }

        // If we didn't provide log4j.properties on the classpath, initialise it using default settings.
        LogUtil.initLog4jIfNotAlready();

        logger.debug("Constructing Searcher...");

        if (indexMode) {
            logger.debug(" Opening IndexWriter...");
            indexWriter = openIndexWriter(indexDir, createNewIndex, null);
            logger.debug(" Opening corresponding IndexReader...");
            reader = DirectoryReader.open(indexWriter, false);
        } else {
            // Open Lucene index
            logger.debug(" Following symlinks...");
            Path indexPath = indexDir.toPath();
            while (Files.isSymbolicLink(indexPath)) {
                // Resolve symlinks, as FSDirectory.open() can't handle them
                indexPath = Files.readSymbolicLink(indexPath);
            }
            logger.debug(" Opening IndexReader...");
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
        }
        this.indexLocation = indexDir;

        // Determine the index structure
        logger.debug(" Determining index structure...");
        indexStructure = new IndexStructure(reader, indexDir, createNewIndex, indexTemplateFile);
        isEmptyIndex = indexStructure.isNewIndex();

        // TODO: we need to create the analyzer before opening the index, because
        // we can't change the analyzer attached to the IndexWriter (and passing a different
        // analyzer in addDocument() is going away in Lucene 5.x).
        // For now, if we're in index mode, we re-open the index with the analyzer we determined.
        logger.debug(" Creating analyzers...");
        createAnalyzers();

        if (indexMode) {
            // Re-open the IndexWriter with the analyzer we've created above (see comment above)
            logger.debug(" Re-opening IndexWriter with newly created analyzers...");
            reader.close();
            reader = null;
            indexWriter.close();
            indexWriter = null;
            indexWriter = openIndexWriter(indexDir, createNewIndex, analyzer);
            logger.debug(" IndexReader too...");
            reader = DirectoryReader.open(indexWriter, false);
        }

        // Detect and open the ContentStore for the contents field
        if (!createNewIndex) {
            logger.debug(" Determining main contents field name...");
            ComplexFieldDesc mainContentsField = indexStructure.getMainContentsField();
            if (mainContentsField == null) {
                if (!indexMode) {
                    if (!isEmptyIndex)
                        throw new RuntimeException("Could not detect main contents field");

                    // Empty index. Set a default name for the contents field.
                    // Searching an empty index will fail and should not be attempted.
                    this.mainContentsFieldName = Searcher.DEFAULT_CONTENTS_FIELD_NAME;
                }
            } else {
                this.mainContentsFieldName = mainContentsField.getName();

                // See if we have a punctuation forward index. If we do,
                // default to creating concordances using that.
                if (mainContentsField.hasPunctuation()) {
                    hitsSettings.setConcordanceType(ConcordanceType.FORWARD_INDEX);
                }
            }

            // Register content stores
            logger.debug(" Opening content stores...");
            for (String cfn : indexStructure.getComplexFields()) {
                if (indexStructure.getComplexFieldDesc(cfn).hasContentStore()) {
                    File dir = new File(indexDir, "cs_" + cfn);
                    if (!dir.exists()) {
                        dir = new File(indexDir, "xml"); // OLD, should eventually be removed
                    }
                    if (dir.exists()) {
                        logger.debug(" " + dir + "...");
                        registerContentStore(cfn, openContentStore(dir, false));
                    }
                }
            }
        }

        logger.debug(" Opening IndexSearcher...");
        indexSearcher = new IndexSearcher(reader);

        // Make sure large wildcard/regex expansions succeed
        logger.debug(" Setting maxClauseCount...");
        BooleanQuery.setMaxClauseCount(100000);

        // Open the forward indices
        if (!createNewIndex) {
            logger.debug(" Opening forward indices...");
            openForwardIndices();
        }
        logger.debug("Done.");
    }
    @Override
    public boolean isEmpty() {
        return isEmptyIndex;
    }

    private void createAnalyzers() {
        Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
        fieldAnalyzers.put("fromInputFile", getAnalyzerInstance("nontokenizing"));
        Analyzer baseAnalyzer = getAnalyzerInstance(indexStructure.getDefaultAnalyzerName());
        for (String fieldName : indexStructure.getMetadataFields()) {
            MetadataFieldDesc fd = indexStructure.getMetadataFieldDesc(fieldName);
            String analyzerName = fd.getAnalyzerName();
            if (analyzerName.length() > 0 && !analyzerName.equalsIgnoreCase("DEFAULT")) {
                Analyzer fieldAnalyzer = getAnalyzerInstance(analyzerName);
                if (fieldAnalyzer == null) {
                    logger.error("Unknown analyzer name " + analyzerName + " for field " + fieldName);
                } else {
                    if (fieldAnalyzer != baseAnalyzer)
                        fieldAnalyzers.put(fieldName, fieldAnalyzer);
                }
            }
        }
        analyzer = new PerFieldAnalyzerWrapper(baseAnalyzer, fieldAnalyzers);
    }

    @Override
    public void rollback() {
        try {
            indexWriter.rollback();
            indexWriter = null;
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    @Override
    public void close() {
        try {
            reader.close();
            if (indexWriter != null) {
                indexWriter.commit();
                indexWriter.close();
            }

            // See if the forward index warmup thread is running, and if so, stop it
            if (warmUpForwardIndicesThread != null && warmUpForwardIndicesThread.isAlive()) {
                warmUpForwardIndicesThread.interrupt();

                // Wait for a maximum of a second for the thread to close down gracefully
                int i = 0;
                while (warmUpForwardIndicesThread.isAlive() && i < 10) {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        // OK
                    }
                    i++;
                }
            }

            super.close();
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }
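    /**
     * Retrieve a stored document from the Lucene index by id.
     *
     * Note that a document that has been marked as deleted may still be
     * returned until the deletion is merged away; use isDeleted() to check.
     *
     * @param doc the Lucene document id
     * @return the stored document
     */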
    @Override
    public Document document(int doc) {
        try {
            if (doc < 0)
                throw new IllegalArgumentException("Negative document id");
            if (doc >= reader.maxDoc())
                throw new IllegalArgumentException("Document id >= maxDoc");
            return reader.document(doc);
        } catch (Exception e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    @Override
    public boolean isDeleted(int doc) {
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        return liveDocs != null && !liveDocs.get(doc);
    }

    @Override
    public int maxDoc() {
        return reader.maxDoc();
    }

    @Override
    @Deprecated
    public Scorer findDocScores(Query q) {
        try {
            Weight w = indexSearcher.createNormalizedWeight(q, true);
            LeafReader scrw = SlowCompositeReaderWrapper.wrap(reader);
            Scorer sc = w.scorer(scrw.getContext(), MultiFields.getLiveDocs(reader));
            return sc;
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    @Override
    @Deprecated
    public TopDocs findTopDocs(Query q, int n) {
        try {
            return indexSearcher.search(q, n);
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }
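    /**
     * Convert word positions to character offsets, using the term vector of
     * the main property of the given complex field (which must have been
     * indexed with positions and character offsets).
     *
     * The arrays are modified in place: on input, startsOfWords and endsOfWords
     * contain token positions; on return they contain the corresponding
     * character offsets. If fillInDefaultsIfNotFound is set, any positions that
     * could not be resolved are filled in using the lowest/highest offsets
     * seen, so the result is always usable (e.g. for highlighting).
     */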
    @Override
    public void getCharacterOffsets(int doc, String fieldName, int[] startsOfWords,
            int[] endsOfWords, boolean fillInDefaultsIfNotFound) {
        if (startsOfWords.length == 0)
            return; // nothing to do
        try {
            // Determine lowest and highest word position we'd like to know something about.
            // This saves a little bit of time for large result sets.
            int minP = -1, maxP = -1;
            int numStarts = startsOfWords.length;
            int numEnds = endsOfWords.length;
            for (int i = 0; i < numStarts; i++) {
                if (startsOfWords[i] < minP || minP == -1)
                    minP = startsOfWords[i];
                if (startsOfWords[i] > maxP)
                    maxP = startsOfWords[i];
            }
            for (int i = 0; i < numEnds; i++) {
                if (endsOfWords[i] < minP || minP == -1)
                    minP = endsOfWords[i];
                if (endsOfWords[i] > maxP)
                    maxP = endsOfWords[i];
            }
            if (minP < 0 || maxP < 0)
                throw new RuntimeException("Can't determine min and max positions");

            String fieldPropName = ComplexFieldUtil.mainPropertyOffsetsField(indexStructure, fieldName);

            org.apache.lucene.index.Terms terms = reader.getTermVector(doc, fieldPropName);
            if (terms == null)
                throw new IllegalArgumentException("Field " + fieldPropName + " in doc " + doc
                        + " has no term vector");
            if (!terms.hasPositions())
                throw new IllegalArgumentException("Field " + fieldPropName + " in doc " + doc
                        + " has no character position information");

            //int lowestPos = -1, highestPos = -1;
            int lowestPosFirstChar = -1, highestPosLastChar = -1;
            int total = numStarts + numEnds;
            boolean[] done = new boolean[total]; // NOTE: array is automatically initialized to false!
            int found = 0;

            // Iterate over terms
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                PostingsEnum dpe = termsEnum.postings(null, null, PostingsEnum.POSITIONS);

                // Iterate over docs containing this term (NOTE: should be only one doc!)
                while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    // Iterate over positions of this term in this doc
                    int positionsRead = 0;
                    int numberOfPositions = dpe.freq();
                    while (positionsRead < numberOfPositions) {
                        int position = dpe.nextPosition();
                        if (position == -1)
                            break;
                        positionsRead++;

                        // Keep track of the lowest and highest char pos, so
                        // we can fill in the character positions we didn't find
                        int startOffset = dpe.startOffset();
                        if (startOffset < lowestPosFirstChar || lowestPosFirstChar == -1) {
                            lowestPosFirstChar = startOffset;
                        }
                        int endOffset = dpe.endOffset();
                        if (endOffset > highestPosLastChar) {
                            highestPosLastChar = endOffset;
                        }

                        // We've calculated the min and max word positions in advance, so
                        // we know we can skip this position if it's outside the range we're
                        // interested in. (Saves a little time for large result sets)
                        if (position < minP || position > maxP) {
                            continue;
                        }

                        for (int m = 0; m < numStarts; m++) {
                            if (!done[m] && position == startsOfWords[m]) {
                                done[m] = true;
                                startsOfWords[m] = startOffset;
                                found++;
                            }
                        }
                        for (int m = 0; m < numEnds; m++) {
                            if (!done[numStarts + m] && position == endsOfWords[m]) {
                                done[numStarts + m] = true;
                                endsOfWords[m] = endOffset;
                                found++;
                            }
                        }

                        // NOTE: we might be tempted to break here if found == total,
                        // but that would foul up our calculation of highestPosLastChar and
                        // lowestPosFirstChar.
                    }
                }
            }
            if (found < total) {
                if (!fillInDefaultsIfNotFound)
                    throw new RuntimeException("Could not find all character offsets!");

                if (lowestPosFirstChar < 0 || highestPosLastChar < 0)
                    throw new RuntimeException("Could not find default char positions!");

                for (int m = 0; m < numStarts; m++) {
                    if (!done[m])
                        startsOfWords[m] = lowestPosFirstChar;
                }
                for (int m = 0; m < numEnds; m++) {
                    if (!done[numStarts + m])
                        endsOfWords[m] = highestPosLastChar;
                }
            }
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    @Override
    public IndexReader getIndexReader() {
        return reader;
    }

    @Override
    protected ContentStore openContentStore(String fieldName) {
        File contentStoreDir = new File(indexLocation, "cs_" + fieldName);
        ContentStore contentStore = ContentStore.open(contentStoreDir, isEmptyIndex);
        registerContentStore(fieldName, contentStore);
        return contentStore;
    }
    /**
     * Opens all the forward indices, to avoid this delay later.
     *
     * NOTE: used to be public; now private because it's done automatically when
     * constructing the Searcher.
     */
    private void openForwardIndices() {
        for (String field : indexStructure.getComplexFields()) {
            ComplexFieldDesc fieldDesc = indexStructure.getComplexFieldDesc(field);
            for (String property : fieldDesc.getProperties()) {
                PropertyDesc propDesc = fieldDesc.getPropertyDesc(property);
                if (propDesc.hasForwardIndex()) {
                    // This property has a forward index. Make sure it is open.
                    String fieldProp = ComplexFieldUtil.propertyField(field, property);
                    logger.debug(" " + fieldProp + "...");
                    getForwardIndex(fieldProp);
                }
            }
        }

        if (!indexMode) {
            logger.debug(" Starting thread to build term indices for forward indices...");
            // Start a background thread to build term indices
            warmUpForwardIndicesThread = new Thread(new Runnable() {
                @Override
                public void run() {
                    warmUpForwardIndices(); // speed up first call to Terms.indexOf()
                }
            });
            warmUpForwardIndicesThread.start();
        }
    }

    @Override
    protected ForwardIndex openForwardIndex(String fieldPropName) {
        ForwardIndex forwardIndex;
        File dir = new File(indexLocation, "fi_" + fieldPropName);

        // Special case for old BL index with "forward" as the name of the single forward index
        // (this should be removed eventually)
        if (!isEmptyIndex && fieldPropName.equals(mainContentsFieldName) && !dir.exists()) {
            // Default forward index used to be called "forward". Look for that instead.
            File alt = new File(indexLocation, "forward");
            if (alt.exists())
                dir = alt;
        }

        if (!isEmptyIndex && !dir.exists()) {
            // Forward index doesn't exist
            return null;
        }
        // Open forward index
        forwardIndex = ForwardIndex.open(dir, indexMode, getCollator(), isEmptyIndex);
        forwardIndex.setIdTranslateInfo(reader, fieldPropName); // how to translate from Lucene doc to fiid
        return forwardIndex;
    }

    @Override
    public QueryExecutionContext getDefaultExecutionContext(String fieldName) {
        ComplexFieldDesc complexFieldDesc = indexStructure.getComplexFieldDesc(fieldName);
        if (complexFieldDesc == null)
            throw new IllegalArgumentException("Unknown complex field " + fieldName);
        PropertyDesc mainProperty = complexFieldDesc.getMainProperty();
        if (mainProperty == null)
            throw new IllegalArgumentException("Main property not found for " + fieldName);
        String mainPropName = mainProperty.getName();
        return new QueryExecutionContext(this, fieldName, mainPropName, defaultCaseSensitive,
                defaultDiacriticsSensitive);
    }

    @Override
    public String getIndexName() {
        return indexLocation.toString();
    }

    @Override
    public IndexWriter openIndexWriter(File indexDir, boolean create, Analyzer useAnalyzer)
            throws IOException, CorruptIndexException, LockObtainFailedException {
        if (!indexDir.exists() && create) {
            indexDir.mkdir();
        }
        Path indexPath = indexDir.toPath();
        while (Files.isSymbolicLink(indexPath)) {
            // Resolve symlinks, as FSDirectory.open() can't handle them
            indexPath = Files.readSymbolicLink(indexPath);
        }
        Directory indexLuceneDir = FSDirectory.open(indexPath);
        if (useAnalyzer == null)
            useAnalyzer = new BLDutchAnalyzer();
        IndexWriterConfig config = LuceneUtil.getIndexWriterConfig(useAnalyzer, create);
        IndexWriter writer = new IndexWriter(indexLuceneDir, config);

        if (create)
            VersionFile.write(indexDir, "blacklab", "2");
        else {
            if (!isIndex(indexDir)) {
                throw new IllegalArgumentException("BlackLab index has wrong type or version! "
                        + VersionFile.report(indexDir));
            }
        }

        return writer;
    }

    @Override
    public IndexWriter getWriter() {
        return indexWriter;
    }

    @Override
    public File getIndexDirectory() {
        return indexLocation;
    }

    @Override
    public void delete(Query q) {
        if (!indexMode)
            throw new RuntimeException("Cannot delete documents, not in index mode");
        try {
            // Open a fresh reader to execute the query.
            // (The try-with-resources closes freshReader when we're done; note that
            // we must NOT close the Searcher's own reader here.)
            try (IndexReader freshReader = DirectoryReader.open(indexWriter, false)) {
                // Execute the query, iterate over the docs and delete from FI and CS.
                IndexSearcher s = new IndexSearcher(freshReader);
                Weight w = s.createNormalizedWeight(q, false);
                try (LeafReader scrw = SlowCompositeReaderWrapper.wrap(freshReader)) {
                    Scorer sc = w.scorer(scrw.getContext(), MultiFields.getLiveDocs(freshReader));
                    if (sc == null)
                        return; // no matching documents

                    // Iterate over matching docs
                    while (true) {
                        int docId;
                        try {
                            docId = sc.nextDoc();
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                        if (docId == DocIdSetIterator.NO_MORE_DOCS)
                            break;
                        Document d = freshReader.document(docId);

                        deleteFromForwardIndices(d);

                        // Delete this document in all content stores
                        contentStores.deleteDocument(d);
                    }
                }
            }

            // Finally, delete the documents from the Lucene index
            indexWriter.deleteDocuments(q);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    @Deprecated
    public Map<String, Integer> termFrequencies(Query documentFilterQuery, String fieldName,
            String propName, String altName) {
        return LuceneUtil.termFrequencies(getIndexSearcher(), documentFilterQuery, fieldName,
                propName, altName);
    }

    @Override
    @Deprecated
    public void collectDocuments(Query query, Collector collector) {
        try {
            indexSearcher.search(query, collector);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public List<String> getFieldTerms(String fieldName, int maxResults) {
        try {
            LeafReader srw = SlowCompositeReaderWrapper.wrap(reader);
            return LuceneUtil.getFieldTerms(srw, fieldName, maxResults);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public IndexSearcher getIndexSearcher() {
        return indexSearcher;
    }
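    /**
     * Return a set view of the ids of all live (non-deleted) documents in the
     * index. The view is lazy: no id list is materialized; membership tests and
     * iteration consult the liveDocs bitmap captured when this method is called.
     */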
    @Override
    public Set<Integer> docIdSet() {

        final int maxDoc = reader.maxDoc();

        final Bits liveDocs = MultiFields.getLiveDocs(reader);

        return new AbstractSet<Integer>() {
            @Override
            public boolean contains(Object o) {
                Integer i = (Integer) o;
                return i < maxDoc && !isDeleted(i);
            }

            boolean isDeleted(Integer i) {
                return liveDocs != null && !liveDocs.get(i);
            }

            @Override
            public boolean isEmpty() {
                // The set is empty if every doc id in the index has been deleted.
                return maxDoc == reader.numDeletedDocs();
            }

            @Override
            public Iterator<Integer> iterator() {
                return new Iterator<Integer>() {
                    int current = -1;
                    int next = -1;

                    @Override
                    public boolean hasNext() {
                        if (next < 0)
                            findNext();
                        return next < maxDoc;
                    }

                    private void findNext() {
                        next = current + 1;
                        while (next < maxDoc && isDeleted(next)) {
                            next++;
                        }
                    }

                    @Override
                    public Integer next() {
                        if (next < 0)
                            findNext();
                        if (next >= maxDoc)
                            throw new NoSuchElementException();
                        current = next;
                        next = -1;
                        return current;
                    }

                    @Override
                    public void remove() {
                        throw new UnsupportedOperationException();
                    }
                };
            }

            @Override
            public int size() {
                // Number of live (non-deleted) documents.
                return maxDoc - reader.numDeletedDocs();
            }
        };
    }
}
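To round off the tutorial, here are a few usage sketches. They are not part of SearcherImpl itself: they assume the static Searcher.open(File) factory from BlackLab's Searcher base class (which is not shown in this file), and the index path and the "title" metadata field are hypothetical. The first sketch opens an index in search mode and iterates over all live documents via docIdSet():

import java.io.File;

import org.apache.lucene.document.Document;

import nl.inl.blacklab.search.Searcher;

public class ReadDocsExample {
    public static void main(String[] args) throws Exception {
        // Assumption: Searcher.open() exists and returns a SearcherImpl for an existing index.
        Searcher searcher = Searcher.open(new File("/data/my-blacklab-index"));
        try {
            // docIdSet() yields the ids of all non-deleted documents.
            for (Integer docId : searcher.docIdSet()) {
                Document d = searcher.document(docId);
                System.out.println(docId + ": " + d.get("title")); // "title" is a hypothetical field
            }
        } finally {
            searcher.close(); // also stops the forward-index warm-up thread
        }
    }
}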
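getCharacterOffsets() communicates through its array parameters: token positions go in, character offsets come out. A minimal sketch, reusing the searcher and docId from the previous example; the field name "contents" and the token positions are made-up values:

static void printFirstWordSpan(Searcher searcher, int docId) {
    // Token positions (word numbers) of two words we are interested in (hypothetical values).
    int[] starts = { 3, 10 };
    int[] ends = { 4, 12 };
    // getCharacterOffsets() overwrites the arrays with character offsets into the original
    // text; passing true fills in defaults for any positions it could not resolve.
    searcher.getCharacterOffsets(docId, "contents", starts, ends, true);
    System.out.println("First word spans characters " + starts[0] + "-" + ends[0]);
}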
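Deleting documents requires index mode. The sketch below assumes a Searcher.openForWriting(File, boolean) factory on the base class (an assumption; only the package-private SearcherImpl constructor appears in this file). The field name "fromInputFile" does appear above in createAnalyzers(), but the value here is hypothetical. Note that delete(Query) removes matching documents from the forward indices and content stores as well as from the Lucene index itself:

import java.io.File;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;

import nl.inl.blacklab.search.Searcher;

public class DeleteExample {
    public static void main(String[] args) throws Exception {
        // Assumption: opens the index in index mode (indexMode == true) without wiping it.
        Searcher searcher = Searcher.openForWriting(new File("/data/my-blacklab-index"), false);
        try {
            searcher.delete(new TermQuery(new Term("fromInputFile", "/input/broken.xml")));
        } finally {
            searcher.close(); // commits the deletions
        }
    }
}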