Example usage for org.apache.lucene.search IndexSearcher getIndexReader

List of usage examples for org.apache.lucene.search IndexSearcher getIndexReader

Introduction

On this page you can find example usage for org.apache.lucene.search IndexSearcher getIndexReader.

Prototype

public IndexReader getIndexReader() 

Document

Return the IndexReader this searches.
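
Before the full examples below, here is a minimal, self-contained sketch of the typical pattern: obtain a searcher's underlying reader and use it for index-wide statistics. The index path is a placeholder, and the snippet assumes a recent Lucene version where FSDirectory.open takes a java.nio.file.Path; it is an illustration, not code from any of the projects listed below.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

// Minimal sketch: the index path is hypothetical.
static long countDocs() throws IOException {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // getIndexReader() returns the reader the searcher was constructed over.
        return searcher.getIndexReader().maxDoc();
    }
}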

Usage

From source file:invertedindex.SearchIndex.java

public ArrayList<SearchResults> multipleSearch(String keyword1, String keyword2, String radio)
        throws IOException {

    String indexLocation = this.getIndexLocation();

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector collector = TopScoreDocCollector.create(topDocs, true);

        String query1 = keyword1;
        String query2 = keyword2;
        query1 = "\"" + query1 + "\"";
        query2 = "\"" + query2 + "\"";

        Query q1 = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(query1);
        Query q2 = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(query2);

        BooleanQuery apiQuery = new BooleanQuery();
        if (radio.equalsIgnoreCase("and")) {
            apiQuery.add(q1, BooleanClause.Occur.MUST);
            apiQuery.add(q2, BooleanClause.Occur.MUST);
        } else if (radio.equalsIgnoreCase("or")) {
            apiQuery.add(q1, BooleanClause.Occur.SHOULD);
            apiQuery.add(q2, BooleanClause.Occur.SHOULD);
        } else if (radio.equalsIgnoreCase("not")) {
            apiQuery.add(q1, BooleanClause.Occur.MUST);
            apiQuery.add(q2, BooleanClause.Occur.MUST_NOT);
        }

        SimpleFragListBuilder fragListBuilder = new SimpleFragListBuilder();
        ScoreOrderFragmentsBuilder fragBuilder = new ScoreOrderFragmentsBuilder();
        FastVectorHighlighter fvh = new FastVectorHighlighter(FastVectorHighlighter.DEFAULT_PHRASE_HIGHLIGHT,
                FastVectorHighlighter.DEFAULT_FIELD_MATCH, fragListBuilder, fragBuilder);

        searcher.search(apiQuery, collector);

        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        System.out.println("Found " + hits.length + " hits.");
        totalHits = hits.length;
        searchResulsAL = new ArrayList<>();

        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            FieldQuery fq = fvh.getFieldQuery(apiQuery);

            String[] fragments = fvh.getBestFragments(fq, searcher.getIndexReader(), docId, "contents", 50, 10);

            Document d = searcher.doc(docId);
            String filePath = d.get("path");

            for (int j = 0; j < fragments.length; j++) {

                String temp = Jsoup.parse(fragments[j]).text();
                LineNumberSearcher lns = new LineNumberSearcher();

                lineNumber = "null";
                lineNumberArrayList = new ArrayList<>();
                // Only resolve a line number for fragments that do not span multiple lines.
                boolean g = Pattern.compile("\\n").matcher(fragments[j]).find();
                if (!g) {
                    lineNumbersList = lns.search(temp, filePath);
                    if (!lineNumbersList.isEmpty()) {
                        lineNumber = lineNumbersList.get(0);
                    }
                }

                // Flatten the fragment to one line, append an ellipsis, and tag the line number if one was found.
                fragments[j] = fragments[j].replaceAll("\\n", " ");
                fragments[j] = fragments[j] + " ....";
                if (!(lineNumber.equals("null"))) {
                    fragments[j] = fragments[j] + " at Line " + lineNumber;
                }

            }

            SearchResults sr = new SearchResults();
            sr.setFilename(d.get("filename"));
            sr.setScore(hits[i].score);
            sr.setFragments(fragments);
            sr.setPath(filePath);
            sr.setContentType(d.get("contentType"));

            searchResulsAL.add(sr);

        }

        reader.close();

    } catch (Exception e) {
        System.out.println("Error searching in search index " + e + " : " + e.getMessage());

    }

    return searchResulsAL;

}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Select {@code R*N} docs from the ranking results and the index as the reranking pool.
 * The process is:
 * 1. Keep the top R documents in the original ranking list
 * 2. Randomly pick {@code (N-1)*R} documents from the rest of the index so that in total we have {@code R*N} documents
 *
 * @param docs The initial ranking results
 * @param context An instance of RerankerContext
 * @return a Set of {@code R*N} document Ids
 */
private Set<Integer> selectDocs(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    Set<Integer> docidSet = new HashSet<>(Arrays
            .asList(ArrayUtils.toObject(Arrays.copyOfRange(docs.ids, 0, Math.min(this.R, docs.ids.length)))));
    long targetSize = this.R * this.N;

    if (docidSet.size() < targetSize) {
        IndexReader reader;
        IndexSearcher searcher;
        if (this.externalIndexPath != null) {
            Path indexPath = Paths.get(this.externalIndexPath);
            if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
                throw new IllegalArgumentException(
                        this.externalIndexPath + " does not exist or is not a directory.");
            }
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
            searcher = new IndexSearcher(reader);
        } else {
            searcher = context.getIndexSearcher();
            reader = searcher.getIndexReader();
        }
        int availableDocsCnt = reader.getDocCount(this.field);
        if (this.deterministic) { // internal docids cannot be relied on due to multi-threaded indexing,
                                  // so we have to rely on external docids here
            Random random = new Random(this.seed);
            while (docidSet.size() < targetSize) {
                if (this.externalDocidsCache != null) {
                    String docid = this.externalDocidsCache
                            .get(random.nextInt(this.externalDocidsCache.size()));
                    Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
                    TopDocs rs = searcher.search(q, 1);
                    docidSet.add(rs.scoreDocs[0].doc);
                } else {
                    docidSet.add(this.internalDocidsCache[random.nextInt(this.internalDocidsCache.length)].doc);
                }
            }
        } else {
            Random random = new Random();
            while (docidSet.size() < targetSize) {
                docidSet.add(random.nextInt(availableDocsCnt));
            }
        }
    }

    return docidSet;
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Extract ALL the terms from the document pool.
 *
 * @param docIds The reranking pool, see {@link #selectDocs} for explanations
 * @param context An instance of RerankerContext
 * @param filterPattern A regex pattern; terms are collected only if they match the pattern. May be null
 * @return A Map of <term -> Set<docId>>, essentially a small inverted list where the Set of docIds records the documents in which the term occurs
 */
private Map<String, Set<Integer>> extractTerms(Set<Integer> docIds, RerankerContext<T> context,
        Pattern filterPattern) throws Exception, IOException {
    IndexReader reader;
    IndexSearcher searcher;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
        searcher = new IndexSearcher(reader);
    } else {
        searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    Map<String, Set<Integer>> termDocidSets = new HashMap<>();
    for (int docid : docIds) {
        Terms terms = reader.getTermVector(docid, LuceneDocumentGenerator.FIELD_BODY);
        if (terms == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        TermsEnum te = terms.iterator();
        if (te == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        while ((te.next()) != null) {
            String term = te.term().utf8ToString();
            // We do some noise filtering here ... a purely empirical heuristic
            if (term.length() < 2)
                continue;
            if (!term.matches("[a-z]+"))
                continue;
            if (filterPattern == null || filterPattern.matcher(term).matches()) {
                if (!termDocidSets.containsKey(term)) {
                    termDocidSets.put(term, new HashSet<>());
                }
                termDocidSets.get(term).add(docid);
            }
        }
    }
    return termDocidSets;
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 * is calculated as
 * <pre>
 * P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term, the scores of every other term in the reranking pool are stored in a
 * PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of <term -> Set<docId>> where the Set of docIds records the documents in which the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // Calculate the mutual information between each pool term and each query term.
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }

    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the queue stores all terms
    // in the reranking pool together with their scores with respect to that query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }

    return resultTermScores;
}
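
The computeMutualInformation helper called above is not included in this listing. The following is a plausible sketch that follows the four-term formula from the javadoc; the method name matches the call sites, but the body is an assumption rather than Anserini's actual implementation.

// Hypothetical sketch of computeMutualInformation, following the formula in the javadoc above.
private double computeMutualInformation(Set<Integer> docidsX, Set<Integer> docidsY, int totalDocCount) {
    if (totalDocCount == 0) {
        return 0.0;
    }
    long bothCount = docidsX.stream().filter(docidsY::contains).count();
    double pX = (double) docidsX.size() / totalDocCount;      // P(t1 occurs)
    double pY = (double) docidsY.size() / totalDocCount;      // P(t2 occurs)
    double pBoth = (double) bothCount / totalDocCount;        // P(both occur)
    double pXOnly = pX - pBoth;                               // P(t1 occurs, t2 does not)
    double pYOnly = pY - pBoth;                               // P(t2 occurs, t1 does not)
    double pNeither = 1.0 - pX - pY + pBoth;                  // P(neither occurs)

    return miTerm(pBoth, pX, pY)
            + miTerm(pNeither, 1.0 - pX, 1.0 - pY)
            + miTerm(pXOnly, pX, 1.0 - pY)
            + miTerm(pYOnly, 1.0 - pX, pY);
}

// Contributes p * log(p / (pA * pB)); empty cells contribute nothing.
private static double miTerm(double p, double pA, double pB) {
    if (p <= 0.0 || pA <= 0.0 || pB <= 0.0) {
        return 0.0;
    }
    return p * Math.log(p / (pA * pB));
}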

From source file:io.anserini.rerank.lib.Rm3Reranker.java

License:Apache License

@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    assert (docs.documents.length == docs.scores.length);

    IndexSearcher searcher = context.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText()))
            .scaleToUnitL1Norm();

    FeatureVector rm = estimateRelevanceModel(docs, reader, context.getSearchArgs().searchtweets);

    rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

    StringBuilder builder = new StringBuilder();
    Iterator<String> terms = rm.iterator();
    while (terms.hasNext()) {
        String term = terms.next();
        double prob = rm.getFeatureWeight(term);
        builder.append(term + "^" + prob + " ");
    }
    String queryText = builder.toString().trim();

    QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
    Query feedbackQuery;
    try {
        feedbackQuery = p.parse(queryText);
    } catch (ParseException e) {
        e.printStackTrace();
        return docs;
    }

    if (this.outputQuery) {
        LOG.info("QID: " + context.getQueryId());
        LOG.info("Original Query: " + context.getQuery().toString(this.field));
        LOG.info("Running new query: " + feedbackQuery.toString(this.field));
    }

    TopDocs rs;
    try {
        Query finalQuery = feedbackQuery;
        // If there's a filter condition, we need to add in the constraint.
        // Otherwise, just use the feedback query.
        if (context.getFilter() != null) {
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
            bqBuilder.add(feedbackQuery, BooleanClause.Occur.MUST);
            finalQuery = bqBuilder.build();
        }

        // Figure out how to break the scoring ties.
        if (context.getSearchArgs().arbitraryScoreTieBreak) {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits);
        } else if (context.getSearchArgs().searchtweets) {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true,
                    true);
        } else {
            rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true,
                    true);
        }
    } catch (IOException e) {
        e.printStackTrace();
        return docs;
    }

    return ScoredDocuments.fromTopDocs(rs, searcher);
}

From source file:io.crate.execution.engine.collect.collectors.LuceneOrderedDocCollector.java

License:Apache License

public LuceneOrderedDocCollector(ShardId shardId, IndexSearcher searcher, Query query, Float minScore,
        boolean doDocsScores, int batchSize, CollectorContext collectorContext,
        Function<FieldDoc, Query> searchAfterQueryOptimize, Sort sort, List<? extends Input<?>> inputs,
        Collection<? extends LuceneCollectorExpression<?>> expressions) {
    super(shardId);
    this.searcher = searcher;
    this.query = query;
    this.minScore = minScore;
    this.doDocsScores = doDocsScores;
    this.batchSize = batchSize;
    this.collectorContext = collectorContext;
    this.searchAfterQueryOptimize = searchAfterQueryOptimize;
    this.sort = sort;
    this.scorer = new DummyScorer();
    this.expressions = expressions;
    this.rowFunction = new ScoreDocRowFunction(searcher.getIndexReader(), inputs, expressions, scorer);
}

From source file:io.crate.operation.collect.collectors.LuceneOrderedDocCollector.java

License:Apache License

public LuceneOrderedDocCollector(ShardId shardId, IndexSearcher searcher, Query query, Float minScore,
        boolean doDocsScores, int batchSize, FieldTypeLookup fieldTypeLookup, CollectorContext collectorContext,
        OrderBy orderBy, Sort sort, List<? extends Input<?>> inputs,
        Collection<? extends LuceneCollectorExpression<?>> expressions) {
    super(shardId);
    this.searcher = searcher;
    this.query = query;
    this.minScore = minScore;
    this.doDocsScores = doDocsScores;
    this.batchSize = batchSize;
    this.fieldTypeLookup = fieldTypeLookup;
    this.collectorContext = collectorContext;
    this.orderBy = orderBy;
    this.sort = sort;
    this.scorer = new DummyScorer();
    this.expressions = expressions;
    this.rowFunction = new ScoreDocRowFunction(searcher.getIndexReader(), inputs, expressions, scorer);
    missingValues = new Object[orderBy.orderBySymbols().size()];
    for (int i = 0; i < orderBy.orderBySymbols().size(); i++) {
        missingValues[i] = LuceneMissingValue.missingValue(orderBy, i);
    }
}

From source file:io.github.msurdi.redeye.core.lucene.AbstractIndex.java

License:Apache License

/**
 * Retrieve a list of documents matching the given query. The query must be a valid Lucene query or '*'
 * to match all documents. If the query is not valid, a best-effort search is done.
 *
 * @param q a query string
 * @return a list of the {@link io.github.msurdi.redeye.api.Indexable} documents matching.
 * @throws IOException
 */
@Override
public List<T> query(String q) throws IOException {
    ensureOpened();

    ArrayList<T> results = Lists.newArrayList();
    Query query;
    try {
        if (MATCH_ALL.equals(q)) {
            query = new MatchAllDocsQuery();
        } else {
            query = new QueryParser(LUCENE_VERSION, DEFAULT_FIELD, analyzer).parse(q);
        }
    } catch (ParseException e) {
        query = new SimpleQueryParser(analyzer, DEFAULT_FIELD).parse(q);
    }

    IndexSearcher searcher = null;

    try {
        searcherManager.maybeRefresh();
        searcher = searcherManager.acquire();
        TopDocs docs = searcher.search(query, Math.max(1, searcher.getIndexReader().maxDoc()));
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            results.add(buildEntity(document));
        }
    } finally {
        searcherManager.release(searcher);
    }

    return results;
}
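
A hypothetical call site for the query method above. The sessionIndex instance, the Session entity, and the field names are illustrative assumptions, not part of the redeye API shown here.

// Hypothetical usage; sessionIndex is an assumed AbstractIndex<Session> instance.
List<Session> everything = sessionIndex.query("*");           // MatchAllDocsQuery branch
List<Session> byUser = sessionIndex.query("user:alice");      // parsed with QueryParser
List<Session> fallback = sessionIndex.query("user:[alice");   // invalid syntax falls back to SimpleQueryParser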

From source file:io.github.msurdi.redeye.core.lucene.AbstractIndex.java

License:Apache License

/**
 * Get the number of documents in the index.
 *
 * @return the number of documents in the index
 * @throws IOException
 */
@Override
public long getCount() throws IOException {
    ensureOpened();
    IndexSearcher searcher = null;
    try {
        searcherManager.maybeRefresh();
        searcher = searcherManager.acquire();
        return searcher.getIndexReader().maxDoc();
    } finally {
        searcherManager.release(searcher);
    }
}

From source file:io.puntanegra.fhir.index.lucene.LuceneRAMIndex.java

License:Apache License

/**
 * Finds the top {@code count} hits for {@code query}, sorting the hits
 * by {@code sort}.
 *
 * @param query
 *            the {@link Query} to search for
 * @param sort
 *            the {@link Sort} to be applied
 * @param count
 *            the max number of results to be collected
 * @param fields
 *            the names of the fields to be loaded
 * @return the found documents
 */
public List<Document> search(Query query, Sort sort, Integer count, Set<String> fields) {
    try {
        indexWriter.commit();
        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        sort = sort.rewrite(searcher);
        TopDocs topDocs = searcher.search(query, count, sort);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Document> documents = new LinkedList<>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc, fields);
            documents.add(document);
        }
        searcher.getIndexReader().close();
        return documents;
    } catch (IOException e) {
        throw new FhirIndexException(e, "Error while searching");
    }
}
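
A hypothetical call to the search method above; the query, sort, and field names are illustrative and not taken from the fhir-index code.

// Hypothetical call site; index is an assumed LuceneRAMIndex instance, and
// "resourceType" and "lastUpdated" are assumed field names.
Set<String> fields = new HashSet<>(Arrays.asList("id", "resourceType"));
Sort sort = new Sort(new SortField("lastUpdated", SortField.Type.LONG, true));
Query query = new TermQuery(new Term("resourceType", "Patient"));
List<Document> hits = index.search(query, sort, 20, fields);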