Example usage for org.apache.lucene.search IndexSearcher doc

List of usage examples for org.apache.lucene.search IndexSearcher doc

Introduction

On this page you can find example usages of the doc method of org.apache.lucene.search.IndexSearcher.

Prototype

public Document doc(int docID) throws IOException 

Source Link

Document

Syntactic sugar for getIndexReader().document(docID).

Usage

From source file:io.anserini.search.SearchClueWeb09b.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator   Default search operator: AND or OR
 * @throws IOException/*w  ww .  j a v  a2 s . c  o  m*/
 * @throws ParseException
 */

public void search(String topicsFile, String submissionFile, QueryParser.Operator operator)
        throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {

        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }
    out.flush();
    out.close();
}

From source file:io.anserini.search.SearchWebCollection.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topics     queries/*from   w  ww .j  a  va2s .  c om*/
 * @param similarity similarity
 * @throws IOException
 * @throws ParseException
 */

public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits)
        throws IOException, ParseException {

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    final String runTag = "BM25_EnglishAnalyzer_" + FIELD_BODY + "_" + similarity.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, new EnglishAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {

        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, numHits).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i + 1);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }
    out.flush();
    out.close();
}

From source file:io.anserini.search.SimpleSearcher.java

License:Apache License

/**
 * Searches the body field with a bag-of-words query built from {@code q} and returns the
 * top hits as {@code Result} objects.
 *
 * @param q the query text
 * @param k the maximum number of hits to retrieve
 * @return one {@code Result} per hit, in the order returned by the searcher; the content is
 *         {@code null} when the hit has no {@code FIELD_RAW} field
 * @throws IOException if searching or loading a stored document fails
 */
public Result[] search(String q, int k) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    Query query = AnalyzerUtils.buildBagOfWordsQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer, q);
    ScoreDoc[] hits = searcher.search(query, k).scoreDocs;

    Result[] results = new Result[hits.length];
    int position = 0;
    for (ScoreDoc hit : hits) {
        Document stored = searcher.doc(hit.doc);
        String docid = stored.getField(LuceneDocumentGenerator.FIELD_ID).stringValue();

        // The raw content is optional; leave it null when the field is absent.
        String content = null;
        IndexableField rawField = stored.getField(LuceneDocumentGenerator.FIELD_RAW);
        if (rawField != null) {
            content = rawField.stringValue();
        }

        results[position++] = new Result(docid, hit.doc, hit.score, content);
    }

    return results;
}

From source file:io.anserini.SearcherCW09B.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator   Default search operator: AND or OR
 * @throws IOException/* ww  w .ja va 2  s. c  om*/
 * @throws ParseException
 */

public void search(String topicsFile, QueryParser.Operator operator) throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    // PrintWriter out = new PrintWriter(Files.newBufferedWriter(path.resolve(runTag + ".txt"), StandardCharsets.US_ASCII));

    PrintStream out = System.out;

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {

        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }
    // out.flush();
    // out.close();
}

From source file:io.datalayer.lucene.helper.AosLuceneUtil.java

License:Apache License

/**
 * Tells whether any hit's stored {@code "title"} field equals the given title.
 *
 * @param searcher searcher used to load the stored document of each hit
 * @param hits     search results to scan
 * @param title    title to look for
 * @return {@code true} as soon as a matching hit is found; {@code false} (after logging an
 *         info message) when none matches
 * @throws IOException if a stored document cannot be loaded
 */
public static boolean hitsIncludeTitle(IndexSearcher searcher, TopDocs hits, String title) throws IOException {
    ScoreDoc[] matches = hits.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        String storedTitle = searcher.doc(matches[i].doc).get("title");
        if (title.equals(storedTitle)) {
            return true;
        }
    }
    LOGGER.info("title '" + title + "' not found");
    return false;
}

From source file:io.datalayer.lucene.helper.AosLuceneUtil.java

License:Apache License

/**
 * Logs the score and stored {@code "title"} field of every hit at info level, preceded by
 * {@code "No hits"} when the total hit count is zero.
 *
 * @param searcher searcher used to load the stored document of each hit
 * @param hits     search results to log
 * @throws IOException if a stored document cannot be loaded
 */
public static void dumpHits(IndexSearcher searcher, TopDocs hits) throws IOException {
    if (hits.totalHits == 0) {
        LOGGER.info("No hits");
    }
    ScoreDoc[] matches = hits.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        ScoreDoc match = matches[i];
        String storedTitle = searcher.doc(match.doc).get("title");
        LOGGER.info(match.score + ":" + storedTitle);
    }
}

From source file:io.datalayer.lucene.index.LuceneSimple.java

License:Apache License

/**
 * Opens the index at {@code indexpath}, searches the {@code "content"} field for
 * {@code keyword}, and logs the total hit count plus the id and stored {@code "filename"}
 * field of up to ten hits.
 *
 * @param indexpath file-system path of the index directory
 * @param keyword   query text, parsed with a {@code QueryParser} over a {@code StandardAnalyzer}
 * @throws IOException if the index cannot be opened or read
 * @throws Exception   if the keyword cannot be parsed
 */
private static void search(String indexpath, String keyword) throws Exception, IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexpath)));
    // The reader was previously never closed; release it on every exit path.
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        LOGGER.info("Search  keyword " + keyword);
        Query query = new QueryParser(Version.LUCENE_46, "content", new StandardAnalyzer(Version.LUCENE_46))
                .parse(keyword);

        TopDocs docs = searcher.search(query, 10);
        LOGGER.info("hits " + docs.totalHits);
        for (ScoreDoc doc : docs.scoreDocs) {
            LOGGER.info("doc id" + doc.doc + "doc filename" + searcher.doc(doc.doc).get("filename"));
        }
    } finally {
        reader.close();
    }

}

From source file:io.datalayer.lucene.search.LuceneQueryTest.java

License:Apache License

/**
 * Opens the index at {@code indexDir}, runs {@code q} collecting up to ten top-scoring hits,
 * logs how many were collected, and loads the stored document of each hit.
 *
 * @param indexDir file-system path of the index directory
 * @param q        query to execute
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException declared for callers; nothing in this body throws it
 */
private static void query(String indexDir, Query q) throws IOException, ParseException {

    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
    // The reader was previously never closed; release it once the stored documents are
    // no longer needed, on every exit path.
    try {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        TopDocsCollector<ScoreDoc> collector = TopScoreDocCollector.create(hitsPerPage, false);
        indexSearcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        LOGGER.info("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i) {
            // The document is loaded only to exercise retrieval; its contents are not used.
            indexSearcher.doc(hits[i].doc);
        }
    } finally {
        reader.close();
    }

}

From source file:io.datalayer.lucene.search.LuceneSearchTest.java

License:Apache License

/**
 * Runs {@code query} against the index in {@code directory}, collecting up to 100
 * top-scoring hits, then logs the hit count, the elapsed time in milliseconds, and the
 * stored value of {@code fieldname} for each hit.
 *
 * @param query     query to execute
 * @param fieldname name of the stored field to log for each hit
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException           if the index cannot be opened or read
 */
private void queryIndex(Query query, String fieldname) throws CorruptIndexException, IOException {

    LOGGER.info("-------------------------------------");

    // The timed span covers opening the reader, the search, and collecting the top docs.
    long start = System.currentTimeMillis();

    int hitsPerPage = 100;

    IndexReader reader = DirectoryReader.open(directory);
    // The reader was previously never closed; release it on every exit path.
    try {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        TopDocsCollector<ScoreDoc> collector = TopScoreDocCollector.create(hitsPerPage, false);

        indexSearcher.search(query, collector);

        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        long end = System.currentTimeMillis();

        LOGGER.info("Found " + hits.length + " hits in " + (end - start) + " milliseconds");
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document document = indexSearcher.doc(docId);
            LOGGER.info((i + 1) + ". " + document.get(fieldname));
        }
    } finally {
        reader.close();
    }

}

From source file:io.datalayer.lucene.search.SimpleSearcherMain.java

License:Apache License

/**
 * Opens the index at {@code indexDir}, searches the {@code "contents"} field for {@code q},
 * prints the total hit count and search time to standard error, and logs the stored
 * {@code "fullpath"} field of up to ten hits.
 *
 * @param indexDir file-system path of the index directory
 * @param q        query text, parsed with a {@code QueryParser} over a {@code StandardAnalyzer}
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code q} cannot be parsed
 */
public static void search(String indexDir, String q) throws IOException, ParseException {

    Directory dir = FSDirectory.open(new File(indexDir));
    // Both the directory and the reader are released on every exit path; previously the
    // reader was closed only on success and the directory was never closed.
    try {
        IndexReader reader = DirectoryReader.open(dir);
        try {
            IndexSearcher is = new IndexSearcher(reader);

            QueryParser parser = new QueryParser(Version.LUCENE_46, "contents",
                    new StandardAnalyzer(Version.LUCENE_46));
            Query query = parser.parse(q);
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                LOGGER.info(doc.get("fullpath"));
            }
        } finally {
            reader.close();
        }
    } finally {
        dir.close();
    }

}