List of usage examples for org.apache.lucene.search.IndexSearcher.getIndexReader()
public IndexReader getIndexReader()
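IndexSearcher.getIndexReader() returns the IndexReader the searcher was constructed over. It is handy when only the searcher is passed around but reader-level APIs (term vectors, document frequencies, doc counts) are needed. Before the longer examples below, a minimal standalone sketch of the call itself, assuming a Lucene 5+ path-based index; "/path/to/index" is a placeholder:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

public class GetIndexReaderExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // getIndexReader() hands back the reader the searcher wraps.
            IndexReader sameReader = searcher.getIndexReader();
            System.out.println("numDocs=" + sameReader.numDocs() + ", maxDoc=" + sameReader.maxDoc());
        }
    }
}

The examples below exercise the same accessor inside highlighting, reranking, collector, and collection-statistics code.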
From source file:invertedindex.SearchIndex.java
public ArrayList<SearchResults> multipleSearch(String keyword1, String keyword2, String radio) throws IOException {
    String indexLocation = this.getIndexLocation();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector collector = TopScoreDocCollector.create(topDocs, true);
        // Wrap both keywords in quotes so they parse as phrase queries.
        String query1 = "\"" + keyword1 + "\"";
        String query2 = "\"" + keyword2 + "\"";
        Query q1 = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(query1);
        Query q2 = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(query2);
        BooleanQuery apiQuery = new BooleanQuery();
        if (radio.equalsIgnoreCase("and")) {
            apiQuery.add(q1, BooleanClause.Occur.MUST);
            apiQuery.add(q2, BooleanClause.Occur.MUST);
        } else if (radio.equalsIgnoreCase("or")) {
            apiQuery.add(q1, BooleanClause.Occur.SHOULD);
            apiQuery.add(q2, BooleanClause.Occur.SHOULD);
        } else if (radio.equalsIgnoreCase("not")) {
            apiQuery.add(q1, BooleanClause.Occur.MUST);
            apiQuery.add(q2, BooleanClause.Occur.MUST_NOT);
        }
        SimpleFragListBuilder fragListBuilder = new SimpleFragListBuilder();
        ScoreOrderFragmentsBuilder fragBuilder = new ScoreOrderFragmentsBuilder();
        FastVectorHighlighter fvh = new FastVectorHighlighter(FastVectorHighlighter.DEFAULT_PHRASE_HIGHLIGHT,
                FastVectorHighlighter.DEFAULT_FIELD_MATCH, fragListBuilder, fragBuilder);
        searcher.search(apiQuery, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("Found " + hits.length + " hits.");
        totalHits = hits.length;
        searchResulsAL = new ArrayList<>();
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            FieldQuery fq = fvh.getFieldQuery(apiQuery);
            String[] fragments = fvh.getBestFragments(fq, searcher.getIndexReader(), docId, "contents", 50, 10);
            Document d = searcher.doc(docId);
            String filePath = d.get("path");
            for (int j = 0; j < fragments.length; j++) {
                String temp = Jsoup.parse(fragments[j]).text();
                LineNumberSearcher lns = new LineNumberSearcher();
                lineNumber = "null";
                lineNumberArrayList = new ArrayList<>();
                boolean hasNewline = Pattern.compile("\\n").matcher(fragments[j]).find();
                if (!hasNewline) {
                    lineNumbersList = lns.search(temp, filePath);
                    if (!lineNumbersList.isEmpty()) {
                        lineNumber = lineNumbersList.get(0);
                    }
                }
                fragments[j] = fragments[j].replaceAll("\\n", " ");
                fragments[j] = fragments[j] + " ....";
                if (!(lineNumber.equals("null"))) {
                    fragments[j] = fragments[j] + " at Line " + lineNumber;
                }
            }
            SearchResults sr = new SearchResults();
            sr.setFilename(d.get("filename"));
            sr.setScore(hits[i].score);
            sr.setFragments(fragments);
            sr.setPath(filePath);
            sr.setContentType(d.get("contentType"));
            searchResulsAL.add(sr);
        }
        reader.close();
    } catch (Exception e) {
        System.out.println("Error searching in search index " + e + " : " + e.getMessage());
    }
    return searchResulsAL;
}
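The example above targets Lucene 4.x APIs (Version.LUCENE_47, a mutable BooleanQuery, File-based FSDirectory.open). In Lucene 5+ BooleanQuery became immutable and is assembled through a builder; a minimal sketch of the equivalent clause construction, with field name and terms as placeholders:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class BooleanQueryBuilderExample {
    public static void main(String[] args) {
        // Lucene 5+ replaces the mutable BooleanQuery used above with a builder.
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new TermQuery(new Term("contents", "keyword1")), BooleanClause.Occur.MUST)
                .add(new TermQuery(new Term("contents", "keyword2")), BooleanClause.Occur.MUST_NOT)
                .build();
        System.out.println(query); // +contents:keyword1 -contents:keyword2
    }
}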
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Select {@code R*N} docs from the ranking results and the index as the reranking pool.
 * The process is:
 * 1. Keep the top R documents in the original ranking list.
 * 2. Randomly pick {@code (N-1)*R} documents from the rest of the index, so in total we have {@code R*N} documents.
 *
 * @param docs The initial ranking results
 * @param context An instance of RerankerContext
 * @return a Set of {@code R*N} document ids
 */
private Set<Integer> selectDocs(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    Set<Integer> docidSet = new HashSet<>(Arrays.asList(
            ArrayUtils.toObject(Arrays.copyOfRange(docs.ids, 0, Math.min(this.R, docs.ids.length)))));
    long targetSize = this.R * this.N;
    if (docidSet.size() < targetSize) {
        IndexReader reader;
        IndexSearcher searcher;
        if (this.externalIndexPath != null) {
            Path indexPath = Paths.get(this.externalIndexPath);
            if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
                throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
            }
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
            searcher = new IndexSearcher(reader);
        } else {
            searcher = context.getIndexSearcher();
            reader = searcher.getIndexReader();
        }
        int availableDocsCnt = reader.getDocCount(this.field);
        if (this.deterministic) {
            // Internal docids cannot be relied on due to multi-threaded indexing,
            // so we rely on external docids here.
            Random random = new Random(this.seed);
            while (docidSet.size() < targetSize) {
                if (this.externalDocidsCache != null) {
                    String docid = this.externalDocidsCache.get(random.nextInt(this.externalDocidsCache.size()));
                    Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
                    TopDocs rs = searcher.search(q, 1);
                    docidSet.add(rs.scoreDocs[0].doc);
                } else {
                    docidSet.add(this.internalDocidsCache[random.nextInt(this.internalDocidsCache.length)].doc);
                }
            }
        } else {
            Random random = new Random();
            while (docidSet.size() < targetSize) {
                docidSet.add(random.nextInt(availableDocsCnt));
            }
        }
    }
    return docidSet;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Extract ALL the terms from the documents pool.
 *
 * @param docIds The reranking pool, see {@link #selectDocs} for explanations
 * @param context An instance of RerankerContext
 * @param filterPattern A regex pattern; terms are collected only if they match the pattern, may be null
 * @return A Map of term -> Set<docId>, a small inverted list where the Set of docIds records where the term occurs
 */
private Map<String, Set<Integer>> extractTerms(Set<Integer> docIds, RerankerContext<T> context,
        Pattern filterPattern) throws IOException {
    IndexReader reader;
    IndexSearcher searcher;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
        searcher = new IndexSearcher(reader);
    } else {
        searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    Map<String, Set<Integer>> termDocidSets = new HashMap<>();
    for (int docid : docIds) {
        Terms terms = reader.getTermVector(docid, LuceneDocumentGenerator.FIELD_BODY);
        if (terms == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        TermsEnum te = terms.iterator();
        if (te == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        while (te.next() != null) {
            String term = te.term().utf8ToString();
            // Noisy filtering here is a purely empirical heuristic.
            if (term.length() < 2)
                continue;
            if (!term.matches("[a-z]+"))
                continue;
            if (filterPattern == null || filterPattern.matcher(term).matches()) {
                termDocidSets.computeIfAbsent(term, k -> new HashSet<>()).add(docid);
            }
        }
    }
    return termDocidSets;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score against each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 * P(both occur)*log{P(both occur)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occur)*log{P(both not occur)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs, t2 not occurs)*log{P(t1 occurs, t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs, t2 occurs)*log{P(t1 not occurs, t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of term -> Set<docId> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> of top terms and their weight scores
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // Get collection statistics so that we can compute idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // Calculate the mutual information between each pool term and each query term.
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the queue stores all terms
    // in the reranking pool and their scores against that query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        // Use floating-point division; (1 + docCount) / df would truncate.
        float idf = (float) Math.log((1.0 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) {
                    // The normalized mutual information of a term with itself is always 1.
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        // Capture the bound before polling: the queue shrinks as we poll.
        int topK = Math.min(termScores.size(), this.K);
        for (int i = 0; i < topK; i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    int topM = Math.min(termScoresPQ.size(), this.M);
    for (int i = 0; i < topM; i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        resultTermScores.put(termScore.getLeft(), termScore.getRight());
    }
    return resultTermScores;
}
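Both AxiomReranker examples call a computeMutualInformation helper that is not shown in this listing. The sketch below is a hypothetical reconstruction based only on the formula quoted in the javadoc; the class, names, and zero-probability handling are assumptions, not the actual Anserini implementation:

import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the helper referenced by computeTermScore above.
final class MutualInformation {
    // docIdsCount is the size of the reranking pool; each set holds the pool
    // documents in which one term occurs.
    static double compute(Set<Integer> docidsX, Set<Integer> docidsY, int docIdsCount) {
        Set<Integer> both = new HashSet<>(docidsX);
        both.retainAll(docidsY);
        double n = docIdsCount;
        double pX = docidsX.size() / n;   // P(t1 occurs)
        double pY = docidsY.size() / n;   // P(t2 occurs)
        double pXY = both.size() / n;     // P(both occur)
        return summand(pXY, pX, pY)                           // both occur
                + summand(1 - pX - pY + pXY, 1 - pX, 1 - pY)  // neither occurs
                + summand(pX - pXY, pX, 1 - pY)               // only t1 occurs
                + summand(pY - pXY, 1 - pX, pY);              // only t2 occurs
    }

    // Each summand is p * log(p / (p1 * p2)); degenerate probabilities contribute 0.
    private static double summand(double p, double p1, double p2) {
        return (p <= 0 || p1 <= 0 || p2 <= 0) ? 0.0 : p * Math.log(p / (p1 * p2));
    }
}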
From source file:io.anserini.rerank.lib.Rm3Reranker.java
License:Apache License
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    assert (docs.documents.length == docs.scores.length);

    IndexSearcher searcher = context.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText()))
            .scaleToUnitL1Norm();
    FeatureVector rm = estimateRelevanceModel(docs, reader, context.getSearchArgs().searchtweets);
    rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

    // Serialize the relevance model as a weighted query string: term^weight term^weight ...
    StringBuilder builder = new StringBuilder();
    Iterator<String> terms = rm.iterator();
    while (terms.hasNext()) {
        String term = terms.next();
        double prob = rm.getFeatureWeight(term);
        builder.append(term).append('^').append(prob).append(' ');
    }
    String queryText = builder.toString().trim();

    QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
    Query feedbackQuery;
    try {
        feedbackQuery = p.parse(queryText);
    } catch (ParseException e) {
        e.printStackTrace();
        return docs;
    }

    if (this.outputQuery) {
        LOG.info("QID: " + context.getQueryId());
        LOG.info("Original Query: " + context.getQuery().toString(this.field));
        LOG.info("Running new query: " + feedbackQuery.toString(this.field));
    }

    TopDocs rs;
    try {
        Query finalQuery = feedbackQuery;
        // If there's a filter condition, add it as a constraint;
        // otherwise, just use the feedback query.
        if (context.getFilter() != null) {
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
            bqBuilder.add(feedbackQuery, BooleanClause.Occur.MUST);
            finalQuery = bqBuilder.build();
        }

        // Figure out how to break scoring ties.
        if (context.getSearchArgs().arbitraryScoreTieBreak) {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits);
        } else if (context.getSearchArgs().searchtweets) {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true, true);
        } else {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true, true);
        }
    } catch (IOException e) {
        e.printStackTrace();
        return docs;
    }

    return ScoredDocuments.fromTopDocs(rs, searcher);
}
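The feedback query above leans on the classic QueryParser boost syntax, where each expansion term carries its RM3 weight as term^weight. A standalone illustration of just that mechanism; the field name and weights are invented:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class BoostedFeedbackQueryExample {
    public static void main(String[] args) throws Exception {
        // Each expansion term carries its model weight as a parser boost.
        String queryText = "lucene^0.42 search^0.31 ranking^0.27";
        QueryParser p = new QueryParser("contents", new WhitespaceAnalyzer());
        Query feedbackQuery = p.parse(queryText);
        // Prints the boosted clauses, e.g. (contents:lucene)^0.42 (contents:search)^0.31 ...
        System.out.println(feedbackQuery);
    }
}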
From source file:io.crate.execution.engine.collect.collectors.LuceneOrderedDocCollector.java
License:Apache License
public LuceneOrderedDocCollector(ShardId shardId,
                                 IndexSearcher searcher,
                                 Query query,
                                 Float minScore,
                                 boolean doDocsScores,
                                 int batchSize,
                                 CollectorContext collectorContext,
                                 Function<FieldDoc, Query> searchAfterQueryOptimize,
                                 Sort sort,
                                 List<? extends Input<?>> inputs,
                                 Collection<? extends LuceneCollectorExpression<?>> expressions) {
    super(shardId);
    this.searcher = searcher;
    this.query = query;
    this.minScore = minScore;
    this.doDocsScores = doDocsScores;
    this.batchSize = batchSize;
    this.collectorContext = collectorContext;
    this.searchAfterQueryOptimize = searchAfterQueryOptimize;
    this.sort = sort;
    this.scorer = new DummyScorer();
    this.expressions = expressions;
    this.rowFunction = new ScoreDocRowFunction(searcher.getIndexReader(), inputs, expressions, scorer);
}
From source file:io.crate.operation.collect.collectors.LuceneOrderedDocCollector.java
License:Apache License
public LuceneOrderedDocCollector(ShardId shardId,
                                 IndexSearcher searcher,
                                 Query query,
                                 Float minScore,
                                 boolean doDocsScores,
                                 int batchSize,
                                 FieldTypeLookup fieldTypeLookup,
                                 CollectorContext collectorContext,
                                 OrderBy orderBy,
                                 Sort sort,
                                 List<? extends Input<?>> inputs,
                                 Collection<? extends LuceneCollectorExpression<?>> expressions) {
    super(shardId);
    this.searcher = searcher;
    this.query = query;
    this.minScore = minScore;
    this.doDocsScores = doDocsScores;
    this.batchSize = batchSize;
    this.fieldTypeLookup = fieldTypeLookup;
    this.collectorContext = collectorContext;
    this.orderBy = orderBy;
    this.sort = sort;
    this.scorer = new DummyScorer();
    this.expressions = expressions;
    this.rowFunction = new ScoreDocRowFunction(searcher.getIndexReader(), inputs, expressions, scorer);
    missingValues = new Object[orderBy.orderBySymbols().size()];
    for (int i = 0; i < orderBy.orderBySymbols().size(); i++) {
        missingValues[i] = LuceneMissingValue.missingValue(orderBy, i);
    }
}
From source file:io.github.msurdi.redeye.core.lucene.AbstractIndex.java
License:Apache License
/**
 * Retrieve a list of documents matching the given query. The query must be a valid Lucene query, or '*'
 * to match all documents. If the query is not valid, a best-effort search is done.
 *
 * @param q a query string
 * @return a list of the {@link io.github.msurdi.redeye.api.Indexable} documents matching
 * @throws IOException
 */
@Override
public List<T> query(String q) throws IOException {
    ensureOpened();
    ArrayList<T> results = Lists.newArrayList();
    Query query;
    try {
        if (MATCH_ALL.equals(q)) {
            query = new MatchAllDocsQuery();
        } else {
            query = new QueryParser(LUCENE_VERSION, DEFAULT_FIELD, analyzer).parse(q);
        }
    } catch (ParseException e) {
        query = new SimpleQueryParser(analyzer, DEFAULT_FIELD).parse(q);
    }
    IndexSearcher searcher = null;
    try {
        searcherManager.maybeRefresh();
        searcher = searcherManager.acquire();
        // Request up to maxDoc() hits so that every matching document is returned.
        TopDocs docs = searcher.search(query, Math.max(1, searcher.getIndexReader().maxDoc()));
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            results.add(buildEntity(document));
        }
    } finally {
        searcherManager.release(searcher);
    }
    return results;
}
From source file:io.github.msurdi.redeye.core.lucene.AbstractIndex.java
License:Apache License
/**
 * Get the number of documents in the index.
 *
 * @return the number of documents in the index
 * @throws IOException
 */
@Override
public long getCount() throws IOException {
    ensureOpened();
    IndexSearcher searcher = null;
    try {
        searcherManager.maybeRefresh();
        searcher = searcherManager.acquire();
        // Note: maxDoc() includes deleted documents that have not yet been merged away;
        // numDocs() would exclude them.
        return searcher.getIndexReader().maxDoc();
    } finally {
        searcherManager.release(searcher);
    }
}
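As the comment above notes, getCount() reports maxDoc(), which still counts deleted documents until their segments are merged, while numDocs() excludes them. A small self-contained sketch of the difference; it uses ByteBuffersDirectory, available since Lucene 8 (older versions would use RAMDirectory):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class MaxDocVsNumDocs {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (String id : new String[] { "a", "b" }) {
                Document doc = new Document();
                doc.add(new StringField("id", id, Field.Store.YES));
                writer.addDocument(doc);
            }
            // Delete one of the two documents; it stays in the segment until a merge.
            writer.deleteDocuments(new Term("id", "a"));
            writer.commit();
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // Typically prints maxDoc=2, numDocs=1.
            System.out.println("maxDoc=" + reader.maxDoc() + ", numDocs=" + reader.numDocs());
        }
    }
}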
From source file:io.puntanegra.fhir.index.lucene.LuceneRAMIndex.java
License:Apache License
/**
 * Finds the top {@code count} hits for {@code query}, sorting the hits by {@code sort}.
 *
 * @param query the {@link Query} to search for
 * @param sort the {@link Sort} to be applied
 * @param count the max number of results to be collected
 * @param fields the names of the fields to be loaded
 * @return the found documents
 */
public List<Document> search(Query query, Sort sort, Integer count, Set<String> fields) {
    try {
        indexWriter.commit();
        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        sort = sort.rewrite(searcher);
        TopDocs topDocs = searcher.search(query, count, sort);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Document> documents = new LinkedList<>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc, fields);
            documents.add(document);
        }
        // Close the underlying reader via the searcher.
        searcher.getIndexReader().close();
        return documents;
    } catch (IOException e) {
        throw new FhirIndexException(e, "Error while searching");
    }
}