Java tutorial
/*******************************************************************************
 * Copyright (c) 2011 Tran Nam Quang.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Tran Nam Quang - initial API and implementation
 *******************************************************************************/

package net.sourceforge.vaticanfetcher.model.search;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.locks.Lock;

import net.sourceforge.vaticanfetcher.enums.Msg;
import net.sourceforge.vaticanfetcher.enums.ProgramConf;
import net.sourceforge.vaticanfetcher.enums.SettingsConf;
import net.sourceforge.vaticanfetcher.model.Fields;
import net.sourceforge.vaticanfetcher.model.IndexLoadingProblems.CorruptedIndex;
import net.sourceforge.vaticanfetcher.model.IndexRegistry;
import net.sourceforge.vaticanfetcher.model.IndexRegistry.ExistingIndexesHandler;
import net.sourceforge.vaticanfetcher.model.LuceneIndex;
import net.sourceforge.vaticanfetcher.model.Path;
import net.sourceforge.vaticanfetcher.model.PendingDeletion;
import net.sourceforge.vaticanfetcher.model.index.IndexingConfig;
import net.sourceforge.vaticanfetcher.model.index.file.FileFactory;
import net.sourceforge.vaticanfetcher.model.index.outlook.OutlookMailFactory;
import net.sourceforge.vaticanfetcher.model.parse.Parser;
import net.sourceforge.vaticanfetcher.util.CheckedOutOfMemoryError;
import net.sourceforge.vaticanfetcher.util.Event;
import net.sourceforge.vaticanfetcher.util.Util;
import net.sourceforge.vaticanfetcher.util.annotations.ImmutableCopy;
import net.sourceforge.vaticanfetcher.util.annotations.NotNull;
import net.sourceforge.vaticanfetcher.util.annotations.NotThreadSafe;
import net.sourceforge.vaticanfetcher.util.annotations.Nullable;
import net.sourceforge.vaticanfetcher.util.annotations.ThreadSafe;
import net.sourceforge.vaticanfetcher.util.annotations.VisibleForPackageGroup;
import net.sourceforge.vaticanfetcher.util.collect.AlphanumComparator;
import net.sourceforge.vaticanfetcher.util.collect.LazyList;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.ChainedFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.PrefixFilter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.TermsFilter;
import org.apache.lucene.search.TopDocs;

import com.google.common.io.Closeables;

/**
 * A search API on top of the index registry. This class is completely
 * thread-safe, so usually only one instance of it is needed for handling
 * concurrent search requests.
 * <p>
 * <b>Important</b>: Instances of this class must be disposed after usage by
 * calling {@link #shutdown()}.
 *
 * @author Tran Nam Quang
 */
@ThreadSafe
public final class Searcher {

    /** A single page of results. */
    public static final class ResultPage {
        /** The result documents for this page. */
        @ImmutableCopy
        public final List<ResultDocument> resultDocuments;

        /** The zero-based index of this page. */
        public final int pageIndex;

        /** The total number of pages. */
        public final int pageCount;

        /** The total number of result documents across all pages. */
        public final int hitCount;

        private ResultPage(@NotNull List<ResultDocument> resultDocuments,
                           int pageIndex,
                           int pageCount,
                           int hitCount) {
            this.resultDocuments = Util.checkNotNull(resultDocuments);
            this.pageIndex = pageIndex;
            this.pageCount = pageCount;
            this.hitCount = hitCount;
        }
    }

    private static final int PAGE_SIZE = ProgramConf.Int.WebInterfacePageSize.get();
    public static final int MAX_RESULTS = ProgramConf.Int.MaxResultsTotal.get();

    private final IndexRegistry indexRegistry;
    private final FileFactory fileFactory;
    private final OutlookMailFactory outlookMailFactory;

    private final Event.Listener<LuceneIndex> addedListener;

    private final BlockingQueue<List<PendingDeletion>> deletionQueue
            = new LinkedBlockingQueue<List<PendingDeletion>>(); // guarded by 'this' lock
    private final Thread deletionThread; // guarded by 'this' lock

    /*
     * Set to true by shutdown() once the deletion thread has been told to
     * stop. An explicit flag is required because the thread's interrupt
     * status is cleared as soon as BlockingQueue.take() throws
     * InterruptedException, so Thread.isInterrupted() cannot be used to
     * find out whether the deletion thread is still consuming the queue.
     */
    private boolean isShutDown = false; // guarded by 'this' lock

    @NotNull
    private MultiSearcher luceneSearcher; // guarded by read-write lock
    @NotNull
    private List<LuceneIndex> indexes; // guarded by read-write lock
    @Nullable
    private volatile IOException ioException;

    private final Lock readLock;
    private final Lock writeLock;

    /**
     * This method should not be called by clients. Use
     * {@link IndexRegistry#getSearcher()} instead.
     *
     * @param corruptedIndexes
     *            A list that will be filled by this constructor with indexes
     *            that couldn't be loaded.
     * @throws IOException
     *             if an I/O error was recorded while creating the initial
     *             Lucene searcher.
     */
    @VisibleForPackageGroup
    public Searcher(@NotNull IndexRegistry indexRegistry,
                    @NotNull FileFactory fileFactory,
                    @NotNull OutlookMailFactory outlookMailFactory,
                    @NotNull final List<CorruptedIndex> corruptedIndexes)
            throws IOException {
        Util.checkNotNull(indexRegistry, fileFactory, outlookMailFactory);
        this.indexRegistry = indexRegistry;
        this.fileFactory = fileFactory;
        this.outlookMailFactory = outlookMailFactory;

        readLock = indexRegistry.getReadLock();
        writeLock = indexRegistry.getWriteLock();

        // Handler for index additions
        addedListener = new Event.Listener<LuceneIndex>() {
            public void update(LuceneIndex eventData) {
                replaceLuceneSearcher();
            }
        };

        /*
         * This lock could be moved into the indexes handler, but we'll put it
         * here to avoid releasing and reacquiring it.
         */
        writeLock.lock();
        try {
            indexRegistry.addListeners(new ExistingIndexesHandler() {
                // Handle existing indexes
                public void handleExistingIndexes(List<LuceneIndex> indexes) {
                    try {
                        corruptedIndexes.addAll(setLuceneSearcher(indexes));
                    }
                    catch (IOException e) {
                        ioException = e;
                    }
                }
            }, addedListener, null); // removedListener is null, see deletion thread below
        }
        finally {
            writeLock.unlock();
        }

        if (ioException != null)
            throw ioException;

        // Handler for index removals
        deletionThread = new Thread(Searcher.class.getName() + " (Approve pending deletions)") {
            public void run() {
                while (true) {
                    try {
                        List<PendingDeletion> deletions = deletionQueue.take();
                        replaceLuceneSearcher();
                        for (PendingDeletion deletion : deletions)
                            deletion.setApprovedBySearcher();
                    }
                    catch (InterruptedException e) {
                        break;
                    }
                }
            }
        };
        deletionThread.start();
    }

    /**
     * Updates the cached indexes and replaces the current Lucene searcher with
     * a new one. An IOException occurring on the way is not thrown but stored,
     * and rethrown by a later call to {@link #search(WebQuery)}.
     */
    @ThreadSafe
    @VisibleForPackageGroup
    public void replaceLuceneSearcher() {
        writeLock.lock();
        try {
            Closeables.close(luceneSearcher, false);
            setLuceneSearcher(indexRegistry.getIndexes());
        }
        catch (IOException e) {
            ioException = e; // Will be thrown later
        }
        finally {
            writeLock.unlock();
        }
    }

    /**
     * Stores the given indexes and builds a new Lucene searcher over them. The
     * previous Lucene searcher is not closed by this method. An index whose
     * Lucene directory cannot be opened is substituted with a dummy searchable,
     * so that positions in the searchables array keep matching positions in the
     * index list.
     *
     * @return the indexes that could not be opened, paired with the exception
     *         that occurred
     */
    @NotNull
    @NotThreadSafe
    private List<CorruptedIndex> setLuceneSearcher(@NotNull List<LuceneIndex> indexes)
            throws IOException {
        this.indexes = Util.checkNotNull(indexes);
        Searchable[] searchables = new Searchable[indexes.size()];
        LazyList<CorruptedIndex> corrupted = new LazyList<CorruptedIndex>();
        for (int i = 0; i < indexes.size(); i++) {
            LuceneIndex index = indexes.get(i);
            try {
                searchables[i] = new IndexSearcher(index.getLuceneDir());
            }
            catch (IOException e) {
                Util.printErr(e);
                searchables[i] = new DummySearchable();
                corrupted.add(new CorruptedIndex(index, e));
            }
        }
        luceneSearcher = new MultiSearcher(searchables);
        return corrupted;
    }

    /**
     * Runs the given query string against all loaded indexes and returns at
     * most {@link #MAX_RESULTS} result documents.
     *
     * @throws SearchException
     *             if the query is invalid, if there is nothing to search in,
     *             or if an I/O error occurs during the search
     * @throws CheckedOutOfMemoryError
     *             if the search ran out of memory
     */
    @ImmutableCopy
    @NotNull
    @ThreadSafe
    public List<ResultDocument> search(@NotNull String queryString)
            throws SearchException, CheckedOutOfMemoryError {
        /*
         * Note: For the desktop interface, we'll always search in all
         * available indexes, even those which are unchecked on the filter
         * panel. This allows the user to re-check the unchecked indexes and
         * see previously hidden results without starting another search.
         */

        // Create Lucene query
        QueryWrapper queryWrapper = createQuery(queryString);
        Query query = queryWrapper.query;
        boolean isPhraseQuery = queryWrapper.isPhraseQuery;

        /*
         * Notes regarding the following code:
         *
         * 1) Lucene will throw an IOException if the user deletes one or more
         * indexes while a search is running over the affected indexes. This
         * can happen when two vaticanfetcher instances are running.
         *
         * 2) All the information needed for displaying the results must be
         * loaded and returned immediately rather than lazily, because after
         * the search the user might delete one or more indexes. This also
         * means the result documents must not access the indexes later on.
         */
        readLock.lock();
        try {
            checkIndexesExist();

            // Perform search; might throw OutOfMemoryError
            ScoreDoc[] scoreDocs = luceneSearcher.search(query, MAX_RESULTS).scoreDocs;

            // Create result documents
            ResultDocument[] results = new ResultDocument[scoreDocs.length];
            for (int i = 0; i < scoreDocs.length; i++) {
                int docId = scoreDocs[i].doc;
                Document doc = luceneSearcher.doc(docId);
                float score = scoreDocs[i].score;
                // subSearcher expects a document number, not a hit position
                LuceneIndex index = indexes.get(luceneSearcher.subSearcher(docId));
                IndexingConfig config = index.getConfig();
                results[i] = new ResultDocument(
                        doc, score, query, isPhraseQuery, config, fileFactory, outlookMailFactory);
            }
            return Arrays.asList(results);
        }
        catch (IllegalArgumentException e) {
            throw wrapEmptyIndexException(e);
        }
        catch (IOException e) {
            throw new SearchException(e.getMessage()); // TODO i18n
        }
        catch (OutOfMemoryError e) {
            throw new CheckedOutOfMemoryError(e);
        }
        finally {
            readLock.unlock();
        }
    }

    /**
     * Converts the given exception into a SearchException if it is the one
     * Lucene throws for empty indexes; otherwise rethrows it unchanged.
     */
    @NotNull
    private static SearchException wrapEmptyIndexException(@NotNull IllegalArgumentException e)
            throws SearchException {
        /*
         * Workaround for bug #390: Lucene 3.5 throws this exception if the
         * indexes are empty, i.e. if no documents have been indexed so far.
         * This happens if the user indexes an empty folder hierarchy with no
         * files in it. Apparently, this problem has been fixed in Lucene 4.0,
         * so when the Lucene jar is upgraded to 4.0, this workaround may be
         * removed.
         */
        String message = e.getMessage();
        // The message may be null; in that case it is not the Lucene error we look for
        if (message != null && message.contains("numHits must be > 0"))
            return new SearchException("No files were indexed."); // not internationalized
        throw e;
    }

    /**
     * Returns the documents whose UID matches one of the given UIDs, at most
     * {@link #MAX_RESULTS} of them, sorted by title with
     * {@link AlphanumComparator#ignoreCaseInstance}.
     *
     * @throws SearchException
     *             if there is nothing to search in, or if an I/O error occurs
     *             during the search
     * @throws CheckedOutOfMemoryError
     *             if the search ran out of memory
     */
    @ImmutableCopy
    @NotNull
    @ThreadSafe
    public List<ResultDocument> list(@NotNull Set<String> uids)
            throws SearchException, CheckedOutOfMemoryError {
        // Construct a filter that only matches documents with the given UIDs
        TermsFilter uidFilter = new TermsFilter();
        String fieldName = Fields.UID.key();
        for (String uid : uids)
            uidFilter.addTerm(new Term(fieldName, uid));

        Query query = new MatchAllDocsQuery();

        readLock.lock();
        try {
            checkIndexesExist();

            // Perform search; might throw OutOfMemoryError
            ScoreDoc[] scoreDocs = luceneSearcher.search(query, uidFilter, MAX_RESULTS).scoreDocs;

            // Create result documents
            ResultDocument[] results = new ResultDocument[scoreDocs.length];
            for (int i = 0; i < results.length; i++) {
                int docId = scoreDocs[i].doc;
                Document doc = luceneSearcher.doc(docId);
                float score = scoreDocs[i].score;
                // subSearcher expects a document number, not a hit position
                LuceneIndex index = indexes.get(luceneSearcher.subSearcher(docId));
                IndexingConfig config = index.getConfig();
                results[i] = new ResultDocument(
                        doc, score, query, true, config, fileFactory, outlookMailFactory);
            }

            // Sort results by title
            Arrays.sort(results, new Comparator<ResultDocument>() {
                public int compare(ResultDocument o1, ResultDocument o2) {
                    return AlphanumComparator.ignoreCaseInstance.compare(o1.getTitle(), o2.getTitle());
                }
            });

            return Arrays.asList(results);
        }
        catch (IllegalArgumentException e) {
            throw wrapEmptyIndexException(e);
        }
        catch (IOException e) {
            throw new SearchException(e.getMessage()); // TODO i18n
        }
        catch (OutOfMemoryError e) {
            throw new CheckedOutOfMemoryError(e);
        }
        finally {
            readLock.unlock();
        }
    }

    /**
     * For the given query, returns the requested page of results. This method
     * should not be called anymore after {@link #shutdown()} has been called,
     * otherwise an IOException will be thrown.
     * <p>
     * If the requested page lies beyond the available hits, the last available
     * page is returned instead; {@link ResultPage#pageIndex} tells which page
     * was actually returned.
     *
     * @throws IOException
     *             if an earlier searcher replacement failed, or if an I/O
     *             error occurs during the search
     * @throws SearchException
     *             if the query is invalid or there is nothing to search in
     * @throws CheckedOutOfMemoryError
     *             if the search ran out of memory
     */
    @NotNull
    @ThreadSafe
    public ResultPage search(@NotNull WebQuery webQuery)
            throws IOException, SearchException, CheckedOutOfMemoryError {
        Util.checkNotNull(webQuery);

        if (ioException != null)
            throw ioException;

        List<Filter> filters = new ArrayList<Filter>(3);

        // Add size filter to filter chain
        if (webQuery.minSize != null || webQuery.maxSize != null) {
            filters.add(NumericRangeFilter.newLongRange(
                    Fields.SIZE.key(), webQuery.minSize, webQuery.maxSize, true, true));
        }

        // Add type filter to filter chain
        if (webQuery.parsers != null) {
            TermsFilter typeFilter = new TermsFilter();
            String fieldName = Fields.PARSER.key();
            typeFilter.addTerm(new Term(fieldName, Fields.EMAIL_PARSER));
            for (Parser parser : webQuery.parsers) {
                String parserName = parser.getClass().getSimpleName();
                typeFilter.addTerm(new Term(fieldName, parserName));
            }
            filters.add(typeFilter);
        }

        // Add location filter to filter chain
        if (webQuery.indexes != null) {
            Filter[] indexFilters = new Filter[webQuery.indexes.size()];
            int i = 0;
            for (LuceneIndex index : webQuery.indexes) {
                Path path = index.getRootFolder().getPath();
                String uid = index.getDocumentType().createUniqueId(path);
                Term prefix = new Term(Fields.UID.key(), uid + "/");
                indexFilters[i++] = new PrefixFilter(prefix);
            }
            filters.add(new ChainedFilter(indexFilters, ChainedFilter.OR));
        }

        // Construct filter chain
        Filter filter = filters.size() == 0
                ? null
                : new ChainedFilter(filters.toArray(new Filter[filters.size()]), ChainedFilter.AND);

        // Create query
        QueryWrapper queryWrapper = createQuery(webQuery.query);
        Query query = queryWrapper.query;
        boolean isPhraseQuery = queryWrapper.isPhraseQuery;

        readLock.lock();
        try {
            checkIndexesExist();

            // Perform search; might throw OutOfMemoryError
            int maxResults = (webQuery.pageIndex + 1) * PAGE_SIZE;
            TopDocs topDocs = luceneSearcher.search(query, filter, maxResults);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            /*
             * Compute start and end indices of returned page. Only the hits up
             * to the end of the requested page were fetched, so the page to
             * return is the last (possibly partial) page of the fetched hits.
             */
            int start;
            int end = scoreDocs.length;
            if (end <= PAGE_SIZE) {
                start = 0;
            }
            else {
                int r = end % PAGE_SIZE;
                start = end - (r == 0 ? PAGE_SIZE : r);
            }

            // Create and fill list of result documents to return
            ResultDocument[] results = new ResultDocument[end - start];
            for (int i = start; i < end; i++) {
                int docId = scoreDocs[i].doc;
                Document doc = luceneSearcher.doc(docId);
                float score = scoreDocs[i].score;
                // subSearcher expects a document number, not a hit position
                LuceneIndex index = indexes.get(luceneSearcher.subSearcher(docId));
                IndexingConfig config = index.getConfig();
                results[i - start] = new ResultDocument(
                        doc, score, query, isPhraseQuery, config, fileFactory, outlookMailFactory);
            }

            int hitCount = topDocs.totalHits;
            int newPageIndex = start / PAGE_SIZE;
            int pageCount = (int) Math.ceil((float) hitCount / PAGE_SIZE);

            return new ResultPage(Arrays.asList(results), newPageIndex, pageCount, hitCount);
        }
        catch (IllegalArgumentException e) {
            throw wrapEmptyIndexException(e);
        }
        catch (OutOfMemoryError e) {
            throw new CheckedOutOfMemoryError(e);
        }
        finally {
            readLock.unlock();
        }
    }

    /**
     * Parses the given query string into a Lucene query over the content
     * field. Leading wildcards are allowed, and terms are combined with AND
     * unless the "use OR operator" setting is enabled.
     *
     * @throws SearchException
     *             if the query string cannot be parsed
     */
    @NotNull
    @ThreadSafe
    private static QueryWrapper createQuery(@NotNull String queryString)
            throws SearchException {
        PhraseDetectingQueryParser queryParser = new PhraseDetectingQueryParser(
                IndexRegistry.LUCENE_VERSION, Fields.CONTENT.key(), IndexRegistry.analyzer);
        queryParser.setAllowLeadingWildcard(true);
        RewriteMethod rewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
        queryParser.setMultiTermRewriteMethod(rewriteMethod);
        if (!SettingsConf.Bool.UseOrOperator.get())
            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);

        try {
            Query query = queryParser.parse(queryString);
            boolean isPhraseQuery = queryParser.isPhraseQuery();
            return new QueryWrapper(query, isPhraseQuery);
        }
        catch (IllegalArgumentException e) {
            /*
             * This happens for example when you enter a fuzzy search with
             * similarity >= 1, e.g. "fuzzy~1".
             */
            String msg = Msg.invalid_query.get() + "\n\n" + e.getMessage();
            throw new SearchException(msg);
        }
        catch (ParseException e) {
            String msg = Msg.invalid_query.get() + "\n\n" + e.getMessage();
            throw new SearchException(msg);
        }
    }

    /**
     * Checks that there is at least one index and that the index directory of
     * every index still exists.
     *
     * @throws SearchException
     *             if there are no indexes, or if an index directory is missing
     */
    @NotNull
    @NotThreadSafe
    private void checkIndexesExist() throws SearchException {
        if (indexes.isEmpty())
            throw new SearchException("Nothing to search in: No indexes have been created yet."); // TODO i18n
        for (LuceneIndex index : indexes) {
            File indexDir = index.getIndexDirPath().getCanonicalFile();
            if (indexDir != null && !indexDir.isDirectory()) {
                String msg = "Folders not found:"; // TODO i18n folders_not_found
                msg += "\n" + indexDir;
                throw new SearchException(msg);
            }
        }
    }

    /**
     * Submits the given deletions for approval. While the deletion thread is
     * running, they are queued and approved by that thread after it has
     * replaced the Lucene searcher; after {@link #shutdown()} they are
     * approved immediately.
     * <p>
     * Given deletions should not be in the registry anymore, since the
     * receiver will retrieve a fresh set of indexes from the registry before
     * approval.
     */
    @ThreadSafe
    @VisibleForPackageGroup
    public void approveDeletions(@NotNull List<PendingDeletion> deletions) {
        Util.checkNotNull(deletions);
        if (deletions.isEmpty())
            return;

        /*
         * If the deletion thread is not available anymore, approve of
         * deletions immediately. Otherwise the given deletion objects would
         * just hang around in the queue until program shutdown and never
         * receive approval, thus the associated indexes would never get
         * deleted.
         */
        synchronized (this) {
            if (isShutDown) {
                for (PendingDeletion pendingDeletion : deletions)
                    pendingDeletion.setApprovedBySearcher();
            }
            else {
                deletionQueue.add(deletions);
            }
        }
    }

    /**
     * Disposes of the receiver. The caller should make sure that no more
     * search requests are submitted to the receiver after this method is
     * called.
     */
    @ThreadSafe
    public void shutdown() {
        if (ioException != null)
            Util.printErr(ioException);

        writeLock.lock();
        try {
            indexRegistry.removeListeners(addedListener, null);
            Closeables.closeQuietly(luceneSearcher);
        }
        finally {
            writeLock.unlock();
        }

        /*
         * This should be done after closing the Lucene searcher in order to
         * ensure that no indexes will be deleted outside the deletion queue
         * while the Lucene searcher is still open.
         */
        synchronized (this) {
            // Record the shutdown so that approveDeletions stops queueing
            isShutDown = true;
            deletionThread.interrupt();
        }
    }

    /** A parsed Lucene query together with the parser's phrase-query flag. */
    private static final class QueryWrapper {
        public final Query query;
        public final boolean isPhraseQuery;

        private QueryWrapper(@NotNull Query query, boolean isPhraseQuery) {
            this.query = Util.checkNotNull(query);
            this.isPhraseQuery = isPhraseQuery;
        }
    }

}