List of usage examples for org.apache.lucene.search IndexSearcher doc
public Document doc(int docID) throws IOException
Sugar for .getIndexReader().document(docID): returns the stored fields of the document with the given internal document id.
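Before the collected examples, here is a minimal sketch of the typical pattern: run a query, then call searcher.doc(docID) on each hit to load its stored fields. The index path and the stored field name "title" are assumptions for illustration; note that FSDirectory.open takes a java.io.File in Lucene 4.x and a java.nio.file.Path in Lucene 5 and later (the Path form is used here).

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class DocExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (path is illustrative).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);

            // Any query works; MatchAllDocsQuery keeps the example self-contained.
            TopDocs hits = searcher.search(new MatchAllDocsQuery(), 10);

            for (ScoreDoc hit : hits.scoreDocs) {
                // Load the stored fields for this hit's internal document id.
                Document doc = searcher.doc(hit.doc);
                System.out.println(hit.score + "\t" + doc.get("title"));
            }
        }
    }
}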
From source file:io.anserini.search.SearchClueWeb09b.java
License:Apache License
/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topicsFile one of: topics.web.1-50.txt, topics.web.51-100.txt, topics.web.101-150.txt, topics.web.151-200.txt
 * @param submissionFile path of the run file to write
 * @param operator default search operator: AND or OR
 * @throws IOException
 * @throws ParseException
 */
public void search(String topicsFile, String submissionFile, QueryParser.Operator operator)
        throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /*
         * For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
         * top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /*
         * The first column is the topic number.
         * The second column is currently unused and should always be "Q0".
         * The third column is the official document identifier of the retrieved document.
         * The fourth column is the rank at which the document is retrieved.
         * The fifth column shows the score (integer or floating point) that generated the ranking.
         * The sixth column is called the "run tag" and should be a unique identifier for your run.
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);

            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}
From source file:io.anserini.search.SearchWebCollection.java
License:Apache License
/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topics queries
 * @param submissionFile path of the run file to write
 * @param similarity similarity
 * @param numHits number of hits to retrieve per topic
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity,
        int numHits) throws IOException, ParseException {

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    final String runTag = "BM25_EnglishAnalyzer_" + FIELD_BODY + "_" + similarity.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, new EnglishAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /*
         * For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
         * top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, numHits).scoreDocs;

        /*
         * The first column is the topic number.
         * The second column is currently unused and should always be "Q0".
         * The third column is the official document identifier of the retrieved document.
         * The fourth column is the rank at which the document is retrieved.
         * The fifth column shows the score (integer or floating point) that generated the ranking.
         * The sixth column is called the "run tag" and should be a unique identifier for your run.
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);

            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i + 1);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}
From source file:io.anserini.search.SimpleSearcher.java
License:Apache License
public Result[] search(String q, int k) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    Query query = AnalyzerUtils.buildBagOfWordsQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer, q);

    TopDocs rs = searcher.search(query, k);
    ScoreDoc[] hits = rs.scoreDocs;

    Result[] results = new Result[hits.length];
    for (int i = 0; i < hits.length; i++) {
        Document doc = searcher.doc(hits[i].doc);
        String docid = doc.getField(LuceneDocumentGenerator.FIELD_ID).stringValue();
        IndexableField field = doc.getField(LuceneDocumentGenerator.FIELD_RAW);
        String content = field == null ? null : field.stringValue();

        results[i] = new Result(docid, hits[i].doc, hits[i].score, content);
    }

    return results;
}
From source file:io.anserini.SearcherCW09B.java
License:Apache License
/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile one of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator default search operator: AND or OR
 * @throws IOException
 * @throws ParseException
 */
public void search(String topicsFile, QueryParser.Operator operator) throws IOException, ParseException {
    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    // PrintWriter out = new PrintWriter(Files.newBufferedWriter(path.resolve(runTag + ".txt"), StandardCharsets.US_ASCII));
    PrintStream out = System.out;

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /*
         * For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
         * top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /*
         * The first column is the topic number.
         * The second column is currently unused and should always be "Q0".
         * The third column is the official document identifier of the retrieved document.
         * The fourth column is the rank at which the document is retrieved.
         * The fifth column shows the score (integer or floating point) that generated the ranking.
         * The sixth column is called the "run tag" and should be a unique identifier for your run.
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);

            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    // out.flush();
    // out.close();
}
From source file:io.datalayer.lucene.helper.AosLuceneUtil.java
License:Apache License
public static boolean hitsIncludeTitle(IndexSearcher searcher, TopDocs hits, String title) throws IOException {
    for (ScoreDoc match : hits.scoreDocs) {
        Document doc = searcher.doc(match.doc);
        if (title.equals(doc.get("title"))) {
            return true;
        }
    }
    LOGGER.info("title '" + title + "' not found");
    return false;
}
From source file:io.datalayer.lucene.helper.AosLuceneUtil.java
License:Apache License
public static void dumpHits(IndexSearcher searcher, TopDocs hits) throws IOException {
    if (hits.totalHits == 0) {
        LOGGER.info("No hits");
    }
    for (ScoreDoc match : hits.scoreDocs) {
        Document doc = searcher.doc(match.doc);
        LOGGER.info(match.score + ":" + doc.get("title"));
    }
}
From source file:io.datalayer.lucene.index.LuceneSimple.java
License:Apache License
private static void search(String indexpath, String keyword) throws Exception, IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexpath)));
    IndexSearcher searcher = new IndexSearcher(reader);

    LOGGER.info("Search keyword " + keyword);
    Query query = new QueryParser(Version.LUCENE_46, "content", new StandardAnalyzer(Version.LUCENE_46))
            .parse(keyword);

    TopDocs docs = searcher.search(query, 10);
    LOGGER.info("hits " + docs.totalHits);

    for (ScoreDoc doc : docs.scoreDocs) {
        LOGGER.info("doc id " + doc.doc + ", doc filename " + searcher.doc(doc.doc).get("filename"));
    }
}
From source file:io.datalayer.lucene.search.LuceneQueryTest.java
License:Apache License
private static void query(String indexDir, Query q) throws IOException, ParseException {
    int hitsPerPage = 10;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    TopDocsCollector collector = TopScoreDocCollector.create(hitsPerPage, false);
    indexSearcher.search(q, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    LOGGER.info("Found " + hits.length + " hits.");
    for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        Document d = indexSearcher.doc(docId);
        // LOGGER.info((i + 1) + ". " + d.get("title"));
    }

    // The searcher can only be closed when there is no need to access the documents any more.
    // indexSearcher.close();
}
From source file:io.datalayer.lucene.search.LuceneSearchTest.java
License:Apache License
private void queryIndex(Query query, String fieldname) throws CorruptIndexException, IOException {
    LOGGER.info("-------------------------------------");
    long start = java.util.Calendar.getInstance().getTimeInMillis();

    int hitsPerPage = 100;

    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    TopDocsCollector collector = TopScoreDocCollector.create(hitsPerPage, false);
    indexSearcher.search(query, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    long end = java.util.Calendar.getInstance().getTimeInMillis();
    // float duration = (end - start) / 1000;
    LOGGER.info("Found " + hits.length + " hits in " + (end - start) + " milliseconds");

    for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        Document document = indexSearcher.doc(docId);
        LOGGER.info((i + 1) + ". " + document.get(fieldname));
    }
}
From source file:io.datalayer.lucene.search.SimpleSearcherMain.java
License:Apache License
public static void search(String indexDir, String q) throws IOException, ParseException {
    Directory dir = FSDirectory.open(new File(indexDir));
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher is = new IndexSearcher(reader);

    QueryParser parser = new QueryParser(Version.LUCENE_46, "contents",
            new StandardAnalyzer(Version.LUCENE_46));
    Query query = parser.parse(q);

    long start = System.currentTimeMillis();
    TopDocs hits = is.search(query, 10);
    long end = System.currentTimeMillis();

    System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start)
            + " milliseconds) that matched query '" + q + "':");

    for (ScoreDoc scoreDoc : hits.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        LOGGER.info(doc.get("fullpath"));
    }

    reader.close();
}
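The examples above target Lucene 4.x through the early Anserini releases. In recent Lucene releases (9.5 and later, if memory serves), IndexSearcher.doc(int) is deprecated in favor of the StoredFields API. The following is a minimal sketch of the equivalent lookup under that assumption; the searcher, query, and field name are supplied by the caller and are illustrative.

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class StoredFieldsExample {
    /** Prints a stored field of each hit without using the deprecated IndexSearcher.doc(int). */
    public static void printHits(IndexSearcher searcher, Query query, String fieldName) throws IOException {
        TopDocs topDocs = searcher.search(query, 10);
        StoredFields storedFields = searcher.storedFields(); // available in Lucene 9.5+
        for (ScoreDoc hit : topDocs.scoreDocs) {
            // Loads the same stored fields that searcher.doc(hit.doc) would return.
            Document doc = storedFields.document(hit.doc);
            System.out.println(hit.score + "\t" + doc.get(fieldName));
        }
    }
}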