Java tutorial
/*
 * Copyright (c) 2009 Andrejs Jermakovics.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Andrejs Jermakovics - initial implementation
 */
package it.unibz.instasearch.indexing;

import static it.unibz.instasearch.InstaSearchPlugin.debug;

import it.unibz.instasearch.InstaSearchPlugin;
import it.unibz.instasearch.indexing.StorageIndexer.IndexChangeListener;
import it.unibz.instasearch.indexing.querying.CSVExpander;
import it.unibz.instasearch.indexing.querying.CurrentProjectSetter;
import it.unibz.instasearch.indexing.querying.FieldAliasConverter;
import it.unibz.instasearch.indexing.querying.FileNameSearcher;
import it.unibz.instasearch.indexing.querying.FilterSetter;
import it.unibz.instasearch.indexing.querying.FolderSearcher;
import it.unibz.instasearch.indexing.querying.LastTermQueryPrefixer;
import it.unibz.instasearch.indexing.querying.LowercaseConverter;
import it.unibz.instasearch.indexing.querying.ModifiedTimeConverter;
import it.unibz.instasearch.indexing.querying.QueryFuzzifier;
import it.unibz.instasearch.indexing.querying.UppercaseNameExpander;
import it.unibz.instasearch.indexing.querying.VisitableQuery;
import it.unibz.instasearch.indexing.querying.WorkingSetExpander;
import it.unibz.instasearch.prefs.PreferenceConstants;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.QueryParser.Operator;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.jface.util.IPropertyChangeListener;
import org.eclipse.jface.util.PropertyChangeEvent;

/**
 * Searcher for searching the index using SearchQuery.
 *
 * <p>The underlying {@link IndexSearcher} is opened lazily on first use and
 * dropped again whenever the index reports a reset or an update, so that
 * later searches open a fresh one.
 */
@SuppressWarnings("deprecation")
public class Searcher implements IPropertyChangeListener, IndexChangeListener {

    /** @see QueryParser#setPhraseSlop(int) */
    private static final int DEFAULT_PHRASE_SLOP = 10;

    /** Minimum length of query in characters */
    public static final int MIN_QUERY_LENGTH = 2;

    /** Minimum number of characters to be considered a word in indexing */
    public static final int MIN_WORD_LENGTH = 1;

    /** Character that identifies the current project in search query */
    public static final String CURRENT_PROJECT_CHAR = ".";

    /** Lazily opened searcher; {@code null} until first use and after a reset. */
    private IndexSearcher indexSearcher;

    /** Similarity installed on every searcher opened by this class. */
    public static final LengthNormSimilarity SIMILARITY = new LengthNormSimilarity(); //TODO: share with WorkspaceIndexer

    /** Analyzer used for the tokenized (non-exact) form of a query. */
    private static final QueryAnalyzer queryAnalyzer = new QueryAnalyzer(MIN_WORD_LENGTH);

    // Query visitors that modify the search query
    private final CSVExpander csvExpander = new CSVExpander();
    private final WorkingSetExpander workingSetExpander = new WorkingSetExpander();
    private final UppercaseNameExpander uppercaseNameExpander = new UppercaseNameExpander();
    private final FieldAliasConverter fieldAliasConverter = new FieldAliasConverter();
    private final ModifiedTimeConverter modifiedTimeConverter = new ModifiedTimeConverter();
    private final LowercaseConverter lowercaseConverter = new LowercaseConverter();
    private final FolderSearcher folderSearcher = new FolderSearcher();
    private final FileNameSearcher fileNameSearcher = new FileNameSearcher();
    private final CurrentProjectSetter currentProjectSetter = new CurrentProjectSetter();
    private final QueryFuzzifier queryFuzzifier = new QueryFuzzifier();
    private final LastTermQueryPrefixer lastTermQueryPrefixer = new LastTermQueryPrefixer(MIN_QUERY_LENGTH + 1);
    private final FilterSetter filterSetter = new FilterSetter();

    /** Whether a match count is computed for every result document. */
    private boolean showMatchCounts = true;

    /** Whether {@link #search(SearchQuery)} falls back to a fuzzy search when nothing else matched. */
    private boolean fuzzySearchAuto = true;

    /**
     * Creates a searcher and loads its preferences through {@link #initPrefs()}.
     */
    public Searcher() {
        initPrefs();
    }

    /**
     * Returns the directory holding the index, resolved from the plugin's
     * index directory location.
     *
     * @return the index directory
     * @throws IOException if the directory cannot be obtained
     */
    protected Directory getIndexDir() throws IOException {
        return FSDirectory.getDirectory(InstaSearchPlugin.getIndexDirLocation(), false);
    }

    /**
     * Parses the search query and runs it against the index.
     *
     * <p>If the search string cannot be parsed, the special query characters
     * {@code ( ) " [ ] ' { }} are replaced with spaces, the cleaned string is
     * stored back into {@code searchQuery} and parsing is attempted once more.
     *
     * @param searchQuery the query to run; its search string may be rewritten
     * @return the search result, or {@code null} if the query is still not
     *         parseable, nothing was found, or the query was canceled
     * @throws Exception if parsing or searching fails otherwise
     */
    private SearchResult searchIndex(SearchQuery searchQuery) throws Exception {
        IndexSearcher searcher = getIndexSearcher();
        IndexReader reader = searcher.getIndexReader();
        boolean exact = searchQuery.isExact();

        Query query;
        try {
            query = parseWithPrefixFallback(searchQuery, reader, exact);
        } catch (ParseException e) {
            // remove special query characters
            String newSearchString = searchQuery.getSearchString().replaceAll("[\\(\\)\"\\[\\]'\\{\\}]", " ");
            try {
                searchQuery.setSearchString(newSearchString);
                // Same fallback as the first attempt, so a TooManyClauses on the
                // cleaned string is handled instead of escaping to the caller.
                query = parseWithPrefixFallback(searchQuery, reader, exact);
            } catch (ParseException ignored) {
                // can have error while typing query, just ignore
                debug(newSearchString, " - ", ignored.getMessage());
                return null;
            }
        }

        return collectSearchResults(searchQuery, searcher, reader, query);
    }

    /**
     * Parses the query with last-term prefixing enabled and, if that produces
     * too many boolean clauses, parses it again without prefixing.
     *
     * @param searchQuery the query to parse
     * @param reader reader used to rewrite the parsed query
     * @param exact whether only the exact form of the query is wanted
     * @return the parsed and rewritten query
     * @throws ParseException if the search string cannot be parsed
     * @throws IOException if rewriting against the index fails
     */
    private Query parseWithPrefixFallback(SearchQuery searchQuery, IndexReader reader, boolean exact)
            throws ParseException, IOException {
        try {
            return parseSearchQuery(searchQuery, reader, exact, true);
        } catch (TooManyClauses e) {
            // too many, try without prefix search
            return parseSearchQuery(searchQuery, reader, exact, false);
        }
    }

    /**
     * Executes the query and wraps every hit into a {@link SearchResultDoc}.
     *
     * @param searchQuery the originating query (limit and cancel state are read from it)
     * @param indexSearcher searcher to execute the query with
     * @param reader reader used to load the hit documents
     * @param query the parsed query
     * @return the search result, or {@code null} if the query was canceled
     *         before searching or produced no hits
     * @throws IOException if the index cannot be read
     */
    private SearchResult collectSearchResults(SearchQuery searchQuery, IndexSearcher indexSearcher,
            IndexReader reader, Query query) throws IOException {
        int maxResults = reader.numDocs(); // all documents
        if (searchQuery.isLimited()) {
            maxResults = searchQuery.getMaxResults();
        }

        Map<String, Float> searchTerms = extractTerms(query);
        TopDocCollector collector = new TopDocCollector(maxResults);

        if (searchQuery.isCanceled()) {
            return null;
        }

        indexSearcher.search(query, collector); // do the actual search

        if (collector.getTotalHits() == 0) {
            return null;
        }

        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        ArrayList<SearchResultDoc> resultDocs = new ArrayList<SearchResultDoc>(hits.length);

        // Stops early (keeping the documents gathered so far) once the query is canceled.
        for (int i = 0; i < hits.length && !searchQuery.isCanceled(); i++) {
            int docId = hits[i].doc;
            float score = hits[i].score;
            Document doc = reader.document(docId);

            // NOTE(review): getIndexDir() is resolved once per hit; whether it can be
            // hoisted out of the loop depends on how SearchResultDoc uses the
            // directory - confirm before changing.
            SearchResultDoc resultDoc = new SearchResultDoc(getIndexDir(), doc, docId, score);

            if (showMatchCounts) {
                resultDoc.computeMatchCount(reader, searchTerms.keySet());
            }

            resultDocs.add(resultDoc);
        }

        return new SearchResult(searchQuery, resultDocs, searchTerms);
    }

    /**
     * Performs a search using the search query.
     *
     * <p>A query already marked fuzzy is searched once, non-exact. Otherwise the
     * search escalates until something is found: exact (only if the query asks
     * for it), then tokenized non-exact, then - if automatic fuzzy search is
     * enabled - fuzzy. The {@code exact} and {@code fuzzy} flags of
     * {@code searchQuery} are modified along the way.
     *
     * @param searchQuery the query to run
     * @return SearchResult or null if nothing is found or the search string is
     *         shorter than {@link #MIN_QUERY_LENGTH}
     * @throws Exception if parsing or searching fails
     */
    public SearchResult search(SearchQuery searchQuery) throws Exception {
        String searchString = searchQuery.getSearchString();

        if (searchString == null || searchString.length() < MIN_QUERY_LENGTH) {
            return null;
        }

        SearchResult result = null;

        if (searchQuery.isFuzzy()) {
            searchQuery.setExact(false);
            result = searchIndex(searchQuery); // search fuzzy
        } else {
            // 1. search exact text
            if (searchQuery.isExact()) {
                result = searchIndex(searchQuery);
            }

            // 2. split search text into tokens and search (non-exact)
            if ((result == null || result.isEmpty()) && !searchQuery.isCanceled()) {
                searchQuery.setExact(false);
                result = searchIndex(searchQuery);
            }

            // 3. search wildcarded and fuzzy matches
            if ((result == null || result.isEmpty()) && fuzzySearchAuto && !searchQuery.isCanceled()) {
                // if fuzzy search enabled
                searchQuery.setFuzzy(true);
                searchQuery.setExact(false);
                result = searchIndex(searchQuery);
            }
        }

        return result;
    }

    /**
     * Returns the shared index searcher, opening it on first use.
     *
     * @return the indexSearcher
     * @throws IOException if the index cannot be opened
     */
    private IndexSearcher getIndexSearcher() throws IOException {
        if (indexSearcher == null) {
            indexSearcher = new IndexSearcher(getIndexDir(), true);
            indexSearcher.setSimilarity(SIMILARITY);
        }
        // indexSearcher.getIndexReader().isCurrent()
        return indexSearcher;
    }

    /**
     * Get all terms that start with prefixText.
     *
     * <p>Enumeration starts at the term built from {@code prefixField} and
     * {@code prefixText} and stops at the first term that is in a different
     * field or does not start with the prefix. The prefix comparison ignores
     * case.
     *
     * @param prefixText prefix the proposed terms must start with
     * @param prefixField field whose terms are enumerated
     *
     * @return proposed terms
     * @throws IOException if the index cannot be read
     */
    public List<String> getProposals(String prefixText, Field prefixField) throws IOException {
        ArrayList<String> proposals = new ArrayList<String>();

        IndexReader reader = getIndexSearcher().getIndexReader();
        Term prefix = prefixField.createTerm(prefixText);
        TermEnum enumerator = reader.terms(prefix);

        // Fixed locale: default-locale lowercasing (e.g. Turkish dotless i)
        // would make the prefix comparison miss matching terms.
        String lowerPrefix = prefixText.toLowerCase(Locale.ENGLISH);

        try {
            do {
                Term term = enumerator.term();

                if (term != null
                        && term.text().toLowerCase(Locale.ENGLISH).startsWith(lowerPrefix)
                        && term.field().equalsIgnoreCase(prefixField.toString())) {
                    proposals.add(term.text());
                } else {
                    break;
                }
            } while (enumerator.next());
        } finally {
            enumerator.close();
        }

        return proposals;
    }

    /**
     * Get names of all projects that are in the index
     *
     * @return list of indexed projects
     * @throws IOException if the index cannot be read
     */
    public List<String> getIndexProjects() throws IOException {
        return getProposals("", Field.PROJ);
    }

    /**
     * Need to reset searcher when index has been updated
     * Otherwise changes won't appear in the results
     * It is recommended to do a warmup search after the searcher was reset
     */
    public void onIndexReset() {
        resetSearcher();
    }

    /**
     * Closes the current index searcher, if any, so that the next search opens
     * a new one. A failure to close is logged and the searcher is dropped anyway.
     */
    private void resetSearcher() {
        if (indexSearcher != null) {
            try {
                indexSearcher.close();
            } catch (IOException e) {
                InstaSearchPlugin.log(e);
            } finally {
                indexSearcher = null;
            }
        }
    }

    /**
     * Warmup search. Searches after this will be faster
     *
     * @throws Exception if the warmup search fails
     */
    private void warmup() throws Exception {
        search(new SearchQuery("<warmup search>", 1));
    }

    /**
     * Closes and re-opens the index for searching
     * Should be called after the index is updated.
     * A failing warmup search is logged, not propagated.
     */
    public void onIndexUpdate() {
        resetSearcher();

        try {
            warmup();
        } catch (Exception e) {
            InstaSearchPlugin.log(e);
        }
    }

    /**
     * Builds the Lucene query for a search query.
     *
     * <p>For an exact search only the keyword-analyzed query is used. Otherwise
     * the tokenized query is combined with the exact query, the latter boosted
     * to twice the tokenized query's boost. The result then passes through the
     * query visitors and finally through Lucene's own rewrite.
     *
     * @param searchQuery the query to parse
     * @param reader reader used for Lucene's rewrite
     * @param exact whether only the exact form of the query is wanted
     * @param prefix whether the last term may be turned into a prefix query
     * @return the rewritten query
     * @throws ParseException if the search string cannot be parsed
     * @throws IOException if rewriting against the index fails
     */
    private Query parseSearchQuery(SearchQuery searchQuery, IndexReader reader, boolean exact, boolean prefix)
            throws ParseException, IOException {
        String searchString = searchQuery.getSearchString();

        BooleanQuery.setMaxClauseCount(5000); // so we don't get TooManyClauses exceptions

        Query exactQuery = createExactQuery(searchQuery);
        Query returnQuery;

        if (exact) {
            // want exact search, use KeywordAnalyzer
            returnQuery = exactQuery;
        } else {
            Query query = parseSearchString(searchString, queryAnalyzer);
            exactQuery.setBoost(query.getBoost() * 2f); // exact query more important
            returnQuery = combineQueries(query, exactQuery);
        }

        returnQuery = rewriteQuery(searchQuery, prefix, returnQuery);

        debug("q: ", returnQuery, " - exact ", exact);

        returnQuery = returnQuery.rewrite(reader); // lucene's rewrite (ie expand prefix queries)

        return returnQuery;
    }

    /**
     * Parses the search string with a {@link KeywordAnalyzer} and applies the
     * uppercase-name and file-name visitors to the result.
     *
     * @param searchQuery the query whose search string is parsed
     * @return the exact-match query
     * @throws ParseException if the search string cannot be parsed
     */
    private Query createExactQuery(SearchQuery searchQuery) throws ParseException {
        Query query = parseSearchString(searchQuery.getSearchString(), new KeywordAnalyzer());

        VisitableQuery visitableQuery = new VisitableQuery(query);
        visitableQuery.accept(uppercaseNameExpander);
        visitableQuery.accept(fileNameSearcher);

        return visitableQuery.getQuery();
    }

    /**
     * Runs the query visitors over the query. The visitors are applied in a
     * fixed order; fuzzifying, last-term prefixing, filtering and current
     * project handling are applied only when the search query asks for them.
     *
     * @param searchQuery source of the fuzzy flag, filter and current project
     * @param prefix whether the last term may be turned into a prefix query
     * @param query the query to modify
     * @return the modified query after all visitors
     */
    private Query rewriteQuery(SearchQuery searchQuery, boolean prefix, Query query) {
        VisitableQuery visitableQuery = new VisitableQuery(query);

        visitableQuery.accept(uppercaseNameExpander);
        visitableQuery.accept(lowercaseConverter);

        if (searchQuery.isFuzzy()) {
            visitableQuery.accept(queryFuzzifier);
        }

        visitableQuery.accept(csvExpander);
        visitableQuery.accept(fieldAliasConverter);
        visitableQuery.accept(modifiedTimeConverter);
        visitableQuery.accept(folderSearcher);

        if (prefix && !searchQuery.isFuzzy()) {
            // prefix last term query for substring search while typing
            visitableQuery.accept(lastTermQueryPrefixer);
        }

        if (searchQuery.getFilter() != null) {
            filterSetter.setFilter(searchQuery.getFilter());
            visitableQuery.accept(filterSetter);
        }

        visitableQuery.accept(workingSetExpander);

        if (searchQuery.getCurrentProject() != null) {
            currentProjectSetter.setCurrentProject(searchQuery.getCurrentProject());
            visitableQuery.accept(currentProjectSetter);
        }

        return visitableQuery.getQuery(); // the modified query after all visitors
    }

    /**
     * Combines the tokenized and the exact query so that a document matching
     * either one is found.
     *
     * @param query the tokenized query
     * @param exactQuery the exact query
     * @return a boolean query with both queries as optional clauses
     */
    private BooleanQuery combineQueries(Query query, Query exactQuery) {
        BooleanQuery topQuery = new BooleanQuery();
        topQuery.add(exactQuery, Occur.SHOULD);
        topQuery.add(query, Occur.SHOULD);
        return topQuery;
    }

    /**
     * Parses a search string against the contents field using the given analyzer.
     *
     * @param searchString the text to parse
     * @param analyzer analyzer applied to the query terms
     * @return the parsed query
     * @throws ParseException if the search string is not a valid query
     */
    private Query parseSearchString(String searchString, Analyzer analyzer) throws ParseException {
        QueryParser queryParser = new QueryParser(Field.CONTENTS.toString(), analyzer);
        queryParser.setDefaultOperator(Operator.AND); // all fields required
        queryParser.setLowercaseExpandedTerms(false);
        queryParser.setPhraseSlop(DEFAULT_PHRASE_SLOP);

        /*
         * Allow words in the query to begin with *
         * see http://lucene.apache.org/java/2_3_1/api/org/apache/lucene/queryParser/QueryParser.html#setAllowLeadingWildcard%28boolean%29
         */
        queryParser.setAllowLeadingWildcard(true);

        return queryParser.parse(searchString);
    }

    /**
     * Extracts search terms and their weights
     *
     * @param query the query to extract the contents-field terms from
     * @return map from term text to term weight
     */
    private static Map<String, Float> extractTerms(Query query) {
        WeightedTerm[] weightedTerms = QueryTermExtractor.getTerms(query, false, Field.CONTENTS.toString());

        Map<String, Float> terms = new HashMap<String, Float>(weightedTerms.length);
        for (WeightedTerm weightedTerm : weightedTerms) {
            terms.put(weightedTerm.getTerm(), weightedTerm.getWeight());
        }

        return terms;
    }

    /**
     * Loads the fuzzy-search and match-count settings from the plugin preferences.
     * Called from the constructor.
     */
    protected void initPrefs() {
        fuzzySearchAuto = InstaSearchPlugin.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO);
        showMatchCounts = InstaSearchPlugin.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT);
    }

    /**
     * Re-reads the match-count or fuzzy-search setting when the corresponding
     * preference changes; other properties are ignored.
     *
     * @param event the preference change event
     */
    public void propertyChange(PropertyChangeEvent event) {
        String prop = event.getProperty();

        if (PreferenceConstants.P_SHOW_MATCH_COUNT.equals(prop)) {
            showMatchCounts = InstaSearchPlugin.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT);
        } else if (PreferenceConstants.P_FUZZY_SEARCH_AUTO.equals(prop)) {
            fuzzySearchAuto = InstaSearchPlugin.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO);
        }
    }
}