List of usage examples for org.apache.lucene.search IndexSearcher doc
public Document doc(int docID) throws IOException

Returns the stored fields of the document with the given internal docID. Sugar for searcher.getIndexReader().document(docID).
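Before the project examples below, a minimal self-contained sketch of the usual pattern: run a query, then resolve each hit's docID to its stored fields with searcher.doc(...). This targets the Lucene 5.x-era API (the examples below span several Lucene versions); the index path and the "title" field are placeholders, not taken from any of the projects.

    import java.nio.file.Paths;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class DocLookupExample {
        public static void main(String[] args) throws Exception {
            // "/path/to/index" and the "title" field are illustrative placeholders.
            try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
                 IndexReader reader = DirectoryReader.open(dir)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TopDocs hits = searcher.search(new TermQuery(new Term("title", "lucene")), 10);
                for (ScoreDoc hit : hits.scoreDocs) {
                    // hit.doc is an internal, index-relative docID, not an application key;
                    // doc(...) resolves it to the document's stored fields.
                    Document doc = searcher.doc(hit.doc);
                    System.out.println(doc.get("title"));
                }
            }
        }
    }

Note that the int passed to doc(...) must come from a ScoreDoc (or another source relative to the same reader); internal docIDs can change across merges, so they should not be stored or reused across reader instances.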
From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java
License:Apache License
    /**
     * Suggest similar words (restricted or not to a field of a user index).
     *
     * @param word the word you want a spell check done on
     * @param num_sug the number of suggested words
     * @param ir the IndexReader of the user index (can be null; see the field param)
     * @param field the field of the user index: if field is not null, the suggested
     *        words are restricted to the words present in this field
     * @param morePopular return only suggested words that are more frequent than the
     *        searched word (only in restricted mode, i.e. indexReader != null and field != null)
     * @throws IOException
     * @return the sorted list of suggested words, ordered by two criteria: first the edit
     *         distance, then (only in restricted mode) the popularity of the suggested
     *         word in the field of the user index
     */
    public String[] suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
            throws IOException {
        float minScore = min;
        final TRStringDistance sd = new TRStringDistance(word);
        final int lengthWord = word.length();
        final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;
        if (!morePopular && goalFreq > 0) {
            return new String[] { word }; // the word exists in the index and we don't want a more popular one
        }

        BooleanQuery query = new BooleanQuery();
        String[] grams;
        String key;
        for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
            key = "gram" + ng; // form key
            grams = formGrams(word, ng); // form word into ngrams (allow dups too)
            if (grams.length == 0) {
                continue;
            }
            if (bStart > 0) { // should we boost prefixes?
                add(query, "start" + ng, grams[0], bStart); // matches start of word
            }
            if (bEnd > 0) { // should we boost suffixes?
                add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
            }
            for (int i = 0; i < grams.length; i++) {
                add(query, key, grams[i]);
            }
        }

        IndexSearcher searcher = new IndexSearcher(this.spellindex);
        // collect more than num_sug matches in case the distance filter triggers
        TopDocCollector collector = new TopDocCollector(10 * num_sug);
        searcher.search(query, collector);
        ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;

        SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
        SuggestWord sugword = new SuggestWord();
        for (int i = 0; i < scoreDocs.length; i++) {
            // resolve the hit's docID; the original passed the loop index i, which is a bug
            Document doc = searcher.doc(scoreDocs[i].doc);
            sugword.string = doc.get(F_WORD); // get the original word
            if (sugword.string.equals(word)) {
                continue; // don't suggest a word for itself, that would be silly
            }
            // edit distance, normalized by the min word length
            sugword.score = doc.getBoost() * (1.0f
                    - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), lengthWord)));
            if (sugword.score < minScore) {
                continue;
            }
            if (ir != null) { // use the user index
                sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the index
                if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                    continue; // don't suggest a word that is not present in the field
                }
            }
            sugqueue.insert(sugword);
            if (sugqueue.size() == num_sug) { // if the queue is full, maintain the min score
                minScore = ((SuggestWord) sugqueue.top()).score;
            }
            sugword = new SuggestWord();
        }

        // convert to a string array
        String[] list = new String[sugqueue.size()];
        for (int i = sugqueue.size() - 1; i >= 0; i--) {
            list[i] = ((SuggestWord) sugqueue.pop()).string;
        }
        searcher.close();
        return list;
    }
From source file:com.flaptor.hounder.util.Idx.java
License:Apache License
    public static void main(String arg[]) throws Exception {
        check(arg.length > 1, null);
        String cmd = arg[0];
        File idx = new File(arg[1]);
        if ("list".equals(cmd)) {
            int num = (arg.length > 2) ? Integer.parseInt(arg[2]) : -1;
            check(idx.exists(), "Index dir not found");
            IndexReader reader = IndexReader.open(idx);
            int docs = reader.numDocs();
            int max = reader.maxDoc();
            System.err.println("Index contains " + docs + " documents plus " + (max - docs) + " deleted.");
            if (num > -1) {
                if (num == 0) num = docs;
                for (int i = 0; i < max && i < num; i++) {
                    System.out.println("----------------------------------------");
                    if (!reader.isDeleted(i)) {
                        Document doc = reader.document(i);
                        List flds = doc.getFields();
                        Iterator iter = flds.iterator();
                        while (iter.hasNext()) {
                            Field fld = (Field) iter.next();
                            String attr = (fld.isIndexed() ? ",i" : "") + (fld.isStored() ? ",s" : "")
                                    + (fld.isTokenized() ? ",t" : "");
                            System.out.println(fld.name() + attr + ": " + fld.stringValue());
                        }
                    }
                }
                System.out.println();
            }
            reader.close(); // close the reader even when no documents were listed
        } else if ("search".equals(cmd)) {
            check(idx.exists(), "Index dir not found");
            check(arg.length > 3, "Not enough arguments");
            String field = arg[2];
            String value = arg[3];
            IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
            ScorelessHitCollector collector = new HashSetScorelessHitCollector();
            searcher.search(new TermQuery(new Term(field, value)), collector);
            Set<Integer> docIds = collector.getMatchingDocuments();
            System.out.println("\nNumber of hits: " + docIds.size() + "\n");
            for (Integer docId : docIds) {
                Document doc = searcher.doc(docId);
                List flds = doc.getFields();
                Iterator iter = flds.iterator();
                while (iter.hasNext()) {
                    Field fld = (Field) iter.next();
                    System.out.println(fld.name() + ": " + fld.stringValue());
                }
            }
            searcher.close();
            System.out.println();
        } else if ("delete".equals(cmd)) {
            check(idx.exists(), "Index dir not found");
            check(arg.length > 3, "Not enough arguments");
            String field = arg[2];
            String value = arg[3];
            IndexReader reader = IndexReader.open(idx);
            reader.deleteDocuments(new Term(field, value));
            reader.close();
        } else if ("optimize".equals(cmd)) {
            IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            writer.optimize();
            writer.close();
        } else if ("merge".equals(cmd)) {
            check(arg.length == 3, "not enough parameters");
            File idx2 = new File(arg[2]);
            check(idx.exists(), "Index dir 1 not found");
            check(idx2.exists(), "Index dir 2 not found");
            IndexReader reader = IndexReader.open(idx2);
            IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            writer.addIndexes(new IndexReader[] { reader });
            writer.close();
            reader.close();
        } else if ("term-count".equals(cmd)) {
            check(arg.length == 3, "not enough parameters");
            check(idx.exists(), "Index dir not found");
            IndexReader reader = IndexReader.open(idx);
            String field = arg[2];
            int count = 0;
            TermEnum terms = reader.terms();
            while (terms.next()) {
                Term term = terms.term();
                if (term.field().equals(field)) count++;
            }
            terms.close();
            reader.close();
            System.out.println("Found " + count + " different values for field " + field);
        } else if ("hit-count".equals(cmd)) {
            check(arg.length > 3, "Not enough arguments");
            check(idx.exists(), "Index dir not found");
            String field = arg[2];
            String value = arg[3];
            IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
            CountingHitCollector collector = new CountingHitCollector();
            searcher.search(new TermQuery(new Term(field, value)), collector);
            System.out.println("\nNumber of hits: " + collector.getDocCount() + "\n");
            searcher.close();
        } else if ("uncompound".equals(cmd)) {
            IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            writer.setUseCompoundFile(false);
            writer.optimize();
            writer.close();
        } else if ("compound".equals(cmd)) {
            IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            writer.setUseCompoundFile(true);
            writer.optimize();
            writer.close();
        } else if ("terms".equals(cmd)) {
            check(arg.length == 3, "not enough parameters");
            check(idx.exists(), "Index dir not found");
            String field = arg[2];
            IndexReader reader = IndexReader.open(idx);
            TermEnum terms = reader.terms();
            while (terms.next()) {
                Term t = terms.term();
                if (t.field().equals(field)) {
                    System.out.println(t.text());
                }
            }
            terms.close(); // the original leaked both of these handles
            reader.close();
        }
    }
From source file:com.gauronit.tagmata.core.Indexer.java
License:Open Source License
    public ArrayList getIndexNames() {
        IndexSearcher mainIndexSearcher = null;
        IndexReader ir = null;
        try {
            ir = IndexReader.open(FSDirectory.open(new File(indexDir + File.separator + MAIN_INDEX),
                    new SimpleFSLockFactory(indexDir + File.separator + MAIN_INDEX)));
            mainIndexSearcher = new IndexSearcher(ir); // the original created this searcher twice
            ArrayList<String[]> indexNames = new ArrayList<String[]>();
            Query q = new WildcardQuery(new Term("indexName", "*"));
            TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
            mainIndexSearcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (ScoreDoc hit : hits) {
                Document doc = mainIndexSearcher.doc(hit.doc);
                String indexName = doc.get("indexName");
                String indexDisplayName = doc.get("displayName");
                indexNames.add(new String[] { indexName, indexDisplayName });
            }
            return indexNames;
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        } finally {
            try {
                // guard against NPEs when open() itself failed
                if (ir != null) {
                    ir.close();
                    ir = null;
                }
                if (mainIndexSearcher != null) {
                    mainIndexSearcher.close();
                    mainIndexSearcher = null;
                }
            } catch (IOException e) {
                logger.info("Error: Unable to close index.");
                e.printStackTrace(); // print before exiting; the original exited first, making this unreachable
                System.exit(0);
            }
        }
    }
From source file:com.gauronit.tagmata.core.Indexer.java
License:Open Source License
    public ArrayList<CardSnapshot> getBookmarks() {
        ArrayList<CardSnapshot> cardSnaps = new ArrayList();
        try {
            IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir + File.separator + MAIN_INDEX),
                    new SimpleFSLockFactory(indexDir + File.separator + MAIN_INDEX)));
            IndexSearcher mainIndexSearcher = new IndexSearcher(ir);
            Query q = new WildcardQuery(new Term("qcId", "*"));
            TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
            mainIndexSearcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (ScoreDoc hit : hits) {
                Document doc = mainIndexSearcher.doc(hit.doc);
                IndexReader reader = IndexReader
                        .open(FSDirectory.open(new File(indexDir + File.separator + doc.get("qcIndexName")),
                                new SimpleFSLockFactory(indexDir + File.separator + doc.get("qcIndexName"))));
                IndexSearcher searcher = new IndexSearcher(reader);
                q = new TermQuery(new Term("id", doc.get("qcId")));
                collector = TopScoreDocCollector.create(10000, false);
                searcher.search(q, collector);
                ScoreDoc[] hits2 = collector.topDocs().scoreDocs;
                doc = searcher.doc(hits2[0].doc);
                cardSnaps.add(new CardSnapshot("", doc));
                reader.close();
                searcher.close();
                reader = null;
                searcher = null;
            }
            ir.close();
            mainIndexSearcher.close();
            ir = null;
            mainIndexSearcher = null;
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return cardSnaps;
    }
From source file:com.gauronit.tagmata.core.Indexer.java
License:Open Source License
    public ArrayList<CardSnapshot> search(String searchText, ArrayList<String> indexNames, boolean searchInTitle,
            boolean searchInTags, boolean searchInText, boolean superFuzzy) {
        ArrayList<CardSnapshot> cardSnaps = new ArrayList();
        try {
            ArrayList<IndexSearcher> searchers = new ArrayList<IndexSearcher>();
            for (String indexName : indexNames) {
                IndexReader reader = IndexReader
                        .open(FSDirectory.open(new File(indexDir + File.separator + indexName),
                                new SimpleFSLockFactory(indexDir + File.separator + indexName)));
                IndexSearcher searcher = new IndexSearcher(reader);
                searchers.add(searcher);
            }
            BooleanQuery query = new BooleanQuery();
            if (searchInTitle) {
                IndexerUtil.getTokenizedQuery(query, "title", searchText, superFuzzy);
            }
            if (searchInTags) {
                IndexerUtil.getTokenizedQuery(query, "tags", searchText, superFuzzy);
            }
            if (searchInText) {
                IndexerUtil.getTokenizedQuery(query, "text", searchText, superFuzzy);
                IndexerUtil.getTokenizedQuery(query, "analyzedText", searchText, superFuzzy);
            }
            for (IndexSearcher searcher : searchers) {
                TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
                searcher.search(query, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                for (ScoreDoc hit : hits) {
                    Document doc = searcher.doc(hit.doc);
                    // the original wrote Version.LUCENE_20.LUCENE_35, accessing a static field
                    // through another constant; the intended value is Version.LUCENE_35
                    TokenStream stream = TokenSources.getTokenStream("text", doc.get("analyzedText"),
                            new StandardAnalyzer(Version.LUCENE_35));
                    QueryScorer scorer = new QueryScorer(query, "analyzedText");
                    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);
                    Highlighter highlighter = new Highlighter(scorer);
                    highlighter.setTextFragmenter(fragmenter);
                    String[] fragments = highlighter.getBestFragments(stream, doc.get("text"), 5);
                    String highlights = "";
                    for (String fragment : fragments) {
                        highlights += fragment + "...";
                    }
                    if (highlights.equals("")) {
                        String text = doc.get("text");
                        if (text.length() > 100) {
                            highlights += text.substring(0, 100);
                        } else {
                            highlights += text;
                        }
                    }
                    cardSnaps.add(new CardSnapshot(highlights, doc));
                }
                searcher.getIndexReader().close();
                searcher.close();
                searcher = null;
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return cardSnaps;
    }
From source file:com.gemstone.gemfire.cache.lucene.internal.repository.IndexRepositoryImpl.java
License:Apache License
    @Override
    public void query(Query query, int limit, IndexResultCollector collector) throws IOException {
        IndexSearcher searcher = searcherManager.acquire();
        try {
            TopDocs docs = searcher.search(query, limit);
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                Object key = SerializerUtil.getKey(doc);
                collector.collect(key, scoreDoc.score);
            }
        } finally {
            searcherManager.release(searcher);
        }
    }
From source file:com.gitblit.LuceneExecutor.java
License:Apache License
    /**
     * Searches the specified repositories for the given text or query.
     *
     * @param text
     *            if the text is null or empty, null is returned
     * @param page
     *            the page number to retrieve. page is 1-indexed.
     * @param pageSize
     *            the number of elements to return for this page
     * @param repositories
     *            a list of repositories to search. if no repositories are
     *            specified, null is returned.
     * @return a list of SearchResults in order from highest to lowest score
     */
    public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
        if (StringUtils.isEmpty(text)) {
            return null;
        }
        if (ArrayUtils.isEmpty(repositories)) {
            return null;
        }
        Set<SearchResult> results = new LinkedHashSet<SearchResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        try {
            // default search checks summary and content
            BooleanQuery query = new BooleanQuery();
            QueryParser qp;

            qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer);
            qp.setAllowLeadingWildcard(true);
            query.add(qp.parse(text), Occur.SHOULD);

            qp = new QueryParser(LUCENE_VERSION, FIELD_CONTENT, analyzer);
            qp.setAllowLeadingWildcard(true);
            query.add(qp.parse(text), Occur.SHOULD);

            IndexSearcher searcher;
            if (repositories.length == 1) {
                // single repository search
                searcher = getIndexSearcher(repositories[0]);
            } else {
                // multiple repository search
                List<IndexReader> readers = new ArrayList<IndexReader>();
                for (String repository : repositories) {
                    IndexSearcher repositoryIndex = getIndexSearcher(repository);
                    readers.add(repositoryIndex.getIndexReader());
                }
                IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
                MultiSourceReader reader = new MultiSourceReader(rdrs);
                searcher = new IndexSearcher(reader);
            }
            Query rewrittenQuery = searcher.rewrite(query);
            logger.debug(rewrittenQuery.toString());

            TopScoreDocCollector collector = TopScoreDocCollector.create(5000, true);
            searcher.search(rewrittenQuery, collector);
            int offset = Math.max(0, (page - 1) * pageSize);
            ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
            int totalHits = collector.getTotalHits();
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
                if (repositories.length == 1) {
                    // single repository search
                    result.repository = repositories[0];
                } else {
                    // multi-repository search
                    MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
                    int index = reader.getSourceIndex(docId);
                    result.repository = repositories[index];
                }
                String content = doc.get(FIELD_CONTENT);
                result.fragment = getHighlightedFragment(analyzer, query, content, result);
                results.add(result);
            }
        } catch (Exception e) {
            logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
        }
        return new ArrayList<SearchResult>(results);
    }
From source file:com.gitblit.service.LuceneService.java
License:Apache License
    /**
     * Searches the specified repositories for the given text or query.
     *
     * @param text
     *            if the text is null or empty, null is returned
     * @param page
     *            the page number to retrieve. page is 1-indexed.
     * @param pageSize
     *            the number of elements to return for this page
     * @param repositories
     *            a list of repositories to search. if no repositories are
     *            specified, null is returned.
     * @return a list of SearchResults in order from highest to lowest score
     */
    public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
        if (StringUtils.isEmpty(text)) {
            return null;
        }
        if (ArrayUtils.isEmpty(repositories)) {
            return null;
        }
        Set<SearchResult> results = new LinkedHashSet<SearchResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        try {
            // default search checks summary and content
            BooleanQuery.Builder bldr = new BooleanQuery.Builder();
            QueryParser qp;

            qp = new QueryParser(FIELD_SUMMARY, analyzer);
            qp.setAllowLeadingWildcard(true);
            bldr.add(qp.parse(text), Occur.SHOULD);

            qp = new QueryParser(FIELD_CONTENT, analyzer);
            qp.setAllowLeadingWildcard(true);
            bldr.add(qp.parse(text), Occur.SHOULD);

            IndexSearcher searcher;
            if (repositories.length == 1) {
                // single repository search
                searcher = getIndexSearcher(repositories[0]);
            } else {
                // multiple repository search
                List<IndexReader> readers = new ArrayList<IndexReader>();
                for (String repository : repositories) {
                    IndexSearcher repositoryIndex = getIndexSearcher(repository);
                    readers.add(repositoryIndex.getIndexReader());
                }
                IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
                MultiSourceReader reader = new MultiSourceReader(rdrs);
                searcher = new IndexSearcher(reader);
            }
            BooleanQuery query = bldr.build();
            Query rewrittenQuery = searcher.rewrite(query);
            logger.debug(rewrittenQuery.toString());

            TopScoreDocCollector collector = TopScoreDocCollector.create(5000);
            searcher.search(rewrittenQuery, collector);
            int offset = Math.max(0, (page - 1) * pageSize);
            ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
            int totalHits = collector.getTotalHits();
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
                if (repositories.length == 1) {
                    // single repository search
                    result.repository = repositories[0];
                } else {
                    // multi-repository search
                    MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
                    int index = reader.getSourceIndex(docId);
                    result.repository = repositories[index];
                }
                String content = doc.get(FIELD_CONTENT);
                result.fragment = getHighlightedFragment(analyzer, query, content, result);
                results.add(result);
            }
        } catch (Exception e) {
            logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
        }
        return new ArrayList<SearchResult>(results);
    }
From source file:com.gitblit.tickets.TicketIndexer.java
License:Apache License
    /**
     * Search for tickets matching the query. The returned tickets are
     * shadows of the real ticket, but suitable for a results list.
     *
     * @param repository
     * @param text
     * @param page
     * @param pageSize
     * @return search results
     */
    public List<QueryResult> searchFor(RepositoryModel repository, String text, int page, int pageSize) {
        if (StringUtils.isEmpty(text)) {
            return Collections.emptyList();
        }
        Set<QueryResult> results = new LinkedHashSet<QueryResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        try {
            // search the title, description and content
            BooleanQuery.Builder bldr = new BooleanQuery.Builder();
            QueryParser qp;

            qp = new QueryParser(Lucene.title.name(), analyzer);
            qp.setAllowLeadingWildcard(true);
            bldr.add(qp.parse(text), Occur.SHOULD);

            qp = new QueryParser(Lucene.body.name(), analyzer);
            qp.setAllowLeadingWildcard(true);
            bldr.add(qp.parse(text), Occur.SHOULD);

            qp = new QueryParser(Lucene.content.name(), analyzer);
            qp.setAllowLeadingWildcard(true);
            bldr.add(qp.parse(text), Occur.SHOULD);

            IndexSearcher searcher = getSearcher();
            Query rewrittenQuery = searcher.rewrite(bldr.build());
            log.debug(rewrittenQuery.toString());

            TopScoreDocCollector collector = TopScoreDocCollector.create(5000);
            searcher.search(rewrittenQuery, collector);
            int offset = Math.max(0, (page - 1) * pageSize);
            ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                QueryResult result = docToQueryResult(doc);
                if (repository != null) {
                    if (!result.repository.equalsIgnoreCase(repository.name)) {
                        continue;
                    }
                }
                results.add(result);
            }
        } catch (Exception e) {
            log.error(MessageFormat.format("Exception while searching for {0}", text), e);
        }
        return new ArrayList<QueryResult>(results);
    }
From source file:com.gitblit.tickets.TicketIndexer.java
License:Apache License
    /**
     * Search for tickets matching the query. The returned tickets are
     * shadows of the real ticket, but suitable for a results list.
     *
     * @param queryText
     * @param page
     * @param pageSize
     * @param sortBy
     * @param desc
     * @return search results
     */
    public List<QueryResult> queryFor(String queryText, int page, int pageSize, String sortBy, boolean desc) {
        if (StringUtils.isEmpty(queryText)) {
            return Collections.emptyList();
        }
        Set<QueryResult> results = new LinkedHashSet<QueryResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        try {
            QueryParser qp = new QueryParser(Lucene.content.name(), analyzer);
            Query query = qp.parse(queryText);

            IndexSearcher searcher = getSearcher();
            Query rewrittenQuery = searcher.rewrite(query);
            log.debug(rewrittenQuery.toString());

            Sort sort;
            if (sortBy == null) {
                sort = new Sort(Lucene.created.asSortField(desc));
            } else {
                sort = new Sort(Lucene.fromString(sortBy).asSortField(desc));
            }
            int maxSize = 5000;
            TopFieldDocs docs = searcher.search(rewrittenQuery, maxSize, sort, false, false);
            int size = (pageSize <= 0) ? maxSize : pageSize;
            int offset = Math.max(0, (page - 1) * size);
            ScoreDoc[] hits = subset(docs.scoreDocs, offset, size);
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                QueryResult result = docToQueryResult(doc);
                result.docId = docId;
                result.totalResults = docs.totalHits;
                results.add(result);
            }
        } catch (Exception e) {
            log.error(MessageFormat.format("Exception while searching for {0}", queryText), e);
        }
        return new ArrayList<QueryResult>(results);
    }