List of usage examples for org.apache.lucene.search IndexSearcher setSimilarity
public void setSimilarity(Similarity similarity)
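setSimilarity replaces the searcher's default scoring model and must be called before the queries whose scores it should affect. Before the project examples below, here is a minimal, self-contained sketch of the pattern, assuming a Lucene 5+ style API; the index path, the field name "text", and the query term are illustrative placeholders:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index ("path/to/index" is a placeholder).
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("path/to/index")));
        IndexSearcher searcher = new IndexSearcher(reader);

        // Install the scoring model before searching; all subsequent searches
        // on this searcher are scored with BM25 (here k1 = 0.9, b = 0.4).
        searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));

        TopDocs hits = searcher.search(new TermQuery(new Term("text", "lucene")), 10);
        System.out.println(hits.totalHits + " matching documents");
        reader.close();
    }
}

Any Similarity subclass can be installed the same way, for example LMDirichletSimilarity for query-likelihood scoring, as the SearchTweets example below shows.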
From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java
License:Open Source License
/**
 * Implementation of the annotate method by IndexBasedAnnotator.
 *
 * The input text is split into fragments according to punctuation;
 * every fragment is used as a query and sent to a Lucene SE that
 * was used to index the terminology (BM25 weight).
 * Up to the 20 top results returned by the system are taken as the annotation for the
 * fragment text. All the fragment annotations combined compose the document annotation
 * that is returned by this method.
 */
public DocumentAnnotation annotate(String document) {
    DocumentAnnotation ret = new DocumentAnnotation();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(termIndexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());
        /*
        document = document.replaceAll("\\[.*?\\]", "").trim();
        //document = document.replaceAll("\\p{Punct}", " ");
        String[] fragments = document.split("[;:\\.,]");
        */
        String[] fragments = getSentences(document).toArray(new String[0]);
        for (String ofragment : fragments) {
            ofragment = ofragment.replaceAll("\\p{Punct}", " ");
            ofragment = ofragment.trim();
            String sa[] = ofragment.split("(?<=[ \\n])");
            EnglishStemmer st = new EnglishStemmer();
            StringBuffer fbuf = new StringBuffer();
            for (String s : sa) {
                st.setCurrent(s.trim());
                st.stem();
                fbuf.append(st.getCurrent());
                fbuf.append(" ");
            }
            String fragment = fbuf.toString().trim(); // stemmed fragment
            if (fragment.length() == 0)
                continue;
            //System.err.println("Annotating: " + fragment);
            QueryParser parser = new QueryParser(Version.LUCENE_44, "labels", Yasemir.analyzer);
            Query query = parser.parse(fragment);
            String stemmedFragment = query.toString("labels").replaceAll("labels:", "");

            TopDocs results = searcher.search(query, 20);
            ScoreDoc[] hits = results.scoreDocs;
            int numTotalHits = results.totalHits;
            //System.err.println(numTotalHits + " total matching classes");
            if (numTotalHits > 0) {
                hits = searcher.search(query, numTotalHits).scoreDocs;
                for (int i = 0; i < Math.min(numTotalHits, MAX_ANNOTS); i++) {
                    Document doc = searcher.doc(hits[i].doc);
                    String ptrn = "(?i)(" + doc.get("labels").replaceAll(", ", "|") + ")";
                    //System.err.println("OWLClass=" + doc.get("id") + " score=" + hits[i].score);
                    if (Tools.checkPattern(stemmedFragment, ptrn)) {
                        //System.err.println("OK: OWLClass=" + doc.get("id") + " score=" + hits[i].score);
                        Annotation ann = new Annotation(doc.get("id"));
                        String ontoID = ann.getRelatedOntology().getOntologyID();
                        Vector<Annotation> annotations = ret.get(ontoID);
                        if (annotations == null)
                            annotations = new Vector<Annotation>();
                        annotations.add(ann);
                        ret.put(ontoID, annotations);
                    }
                }
            }
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:framework.retrieval.engine.query.RQuery.java
License:Apache License
/**
 * Creates a query object over one or more index readers.
 *
 * @param analyzerFactory        factory that supplies the analyzer and similarity
 * @param highlighterFactory     factory that supplies the result highlighter
 * @param queryResultTopDocsNum  maximum number of top documents to retrieve
 * @param baseIndexPath          base path of the index
 * @param indexReaderProxys      proxies for the underlying index readers
 */
public RQuery(IRAnalyzerFactory analyzerFactory, IHighlighterFactory highlighterFactory,
        int queryResultTopDocsNum, String baseIndexPath, IndexReaderProxy[] indexReaderProxys) {
    this.analyzerFactory = analyzerFactory;
    this.highlighterFactory = highlighterFactory;
    this.queryResultTopDocsNum = queryResultTopDocsNum;
    this.indexReaderProxys = indexReaderProxys;

    int length = indexReaderProxys.length;
    if (length > 1) {
        IndexSearcher[] searchers = new IndexSearcher[length];
        for (int i = 0; i < length; i++) {
            IndexSearcher searcher = null;
            try {
                searcher = new IndexSearcher(indexReaderProxys[i].getIndexReader());
            } catch (Exception e) {
                throw new RetrievalQueryException(e);
            }
            searchers[i] = searcher;
        }
        try {
            searcher = new MultiSearcher(searchers);
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    } else {
        try {
            searcher = new IndexSearcher(indexReaderProxys[0].getIndexReader());
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    }

    searcher.setSimilarity(analyzerFactory.createSimilarity());
}
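A note on the example above: MultiSearcher is a pre-4.0 Lucene API that was removed in Lucene 4.0. On modern Lucene, a sketch of the equivalent pattern (the index paths here are placeholders) merges the readers with a MultiReader and sets the similarity once on a single searcher:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class MultiIndexSearchExample {
    public static void main(String[] args) throws Exception {
        // Open one reader per index ("index1"/"index2" are placeholder paths),
        // merge them with a MultiReader, and install the similarity once.
        IndexReader r1 = DirectoryReader.open(FSDirectory.open(Paths.get("index1")));
        IndexReader r2 = DirectoryReader.open(FSDirectory.open(Paths.get("index2")));
        IndexSearcher searcher = new IndexSearcher(new MultiReader(r1, r2));
        searcher.setSimilarity(new BM25Similarity());
    }
}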
From source file:HW1.SearchFiles.java
License:Apache License
public static void main(String[] args) throws Exception {
    String queryString = "dislike football";
    String indexPath = "/Users/yangyang/Desktop/lucene/hw1/index/index04";
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer();
    searcher.setSimilarity(new BM25Similarity());
    QueryParser parser = new QueryParser("TEXT", analyzer);
    Query query = parser.parse(queryString);
    System.out.println("Searching for: " + query.toString("TEXT"));

    TopDocs results = searcher.search(query, 10);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    for (int i = 0; i < hits.length; i++) {
        Document doc = searcher.doc(hits[i].doc);
        System.out.println("DOCNO: " + doc.get("DOCNO"));
    }
    reader.close();
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);

        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term));     // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder();   // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
From source file:io.anserini.qa.RetrieveSentences.java
License:Apache License
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits)
        throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Using the BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);

    EnglishAnalyzer ea = new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);

        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * If the external reranking context is not null, we will first search against the external
 * index and return the top-ranked documents.
 *
 * @param docs The initial ranking results against the target index. We will return them if the
 *             external index is null.
 *
 * @return Top-ranked ScoredDocuments from searching the external index
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context)
        throws IOException {
    if (externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(context.getIndexSearcher().getSimilarity(true));

        SearchArgs args = new SearchArgs();
        args.hits = this.R;
        args.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
        args.searchtweets = context.getSearchArgs().searchtweets;

        RerankerContext<T> externalContext = new RerankerContext<>(searcher, context.getQueryId(),
                context.getQuery(), context.getQueryText(), context.getQueryTokens(),
                context.getFilter(), args);
        return searchTopDocs(null, externalContext);
    } else {
        return docs;
    }
}
From source file:io.anserini.search.SearchClueWeb09b.java
License:Apache License
/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator Default search operator: AND or OR
 * @throws IOException
 * @throws ParseException
 */
public void search(String topicsFile, String submissionFile, QueryParser.Operator operator)
        throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);
    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010, 2011, and 2012; an experimental run consists of the top 10,000
         * documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}
From source file:io.anserini.search.SearchCollection.java
License:Apache License
@SuppressWarnings("unchecked") public <K> int runTopics() throws IOException { IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(similarity); Path topicsFile = Paths.get(args.topics); if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) { throw new IllegalArgumentException( "Topics file : " + topicsFile + " does not exist or is not a (readable) file."); }/*www . j a v a 2s . c o m*/ TopicReader<K> tr; SortedMap<K, Map<String, String>> topics; try { tr = (TopicReader<K>) Class.forName("io.anserini.search.query." + args.topicReader + "TopicReader") .getConstructor(Path.class).newInstance(topicsFile); topics = tr.read(); } catch (Exception e) { throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); } final String runTag = "Anserini_" + args.topicfield + "_" + (args.keepstop ? "KeepStopwords_" : "") + FIELD_BODY + "_" + (args.searchtweets ? "SearchTweets_" : "") + similarity.toString(); PrintWriter out = new PrintWriter( Files.newBufferedWriter(Paths.get(args.output), StandardCharsets.US_ASCII)); for (Map.Entry<K, Map<String, String>> entry : topics.entrySet()) { K qid = entry.getKey(); String queryString = entry.getValue().get(args.topicfield); ScoredDocuments docs; if (args.searchtweets) { docs = searchTweets(searcher, qid, queryString, Long.parseLong(entry.getValue().get("time"))); } else { docs = search(searcher, qid, queryString); } /** * the first column is the topic number. * the second column is currently unused and should always be "Q0". * the third column is the official document identifier of the retrieved document. * the fourth column is the rank the document is retrieved. * the fifth column shows the score (integer or floating point) that generated the ranking. * the sixth column is called the "run tag" and should be a unique identifier for your */ for (int i = 0; i < docs.documents.length; i++) { out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], ((i == 0 || i == docs.documents.length - 1) ? runTag : "See_Line1"))); } } out.flush(); out.close(); return topics.size(); }
From source file:io.anserini.search.SearchTweets.java
License:Apache License
public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }

    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }

    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name,
                "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));

    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);

    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();

        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L,
                topic.getQueryTweetTime(), true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, searchArgs.hits);

        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(),
                Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1),
                    docs.scores[i], searchArgs.runtag));
        }

        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }

    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");

    reader.close();
    out.close();
}
From source file:io.anserini.search.SearchWebCollection.java
License:Apache License
/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topics queries
 * @param similarity similarity
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity,
        int numHits) throws IOException, ParseException {

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    final String runTag = "BM25_EnglishAnalyzer_" + FIELD_BODY + "_" + similarity.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, new EnglishAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010, 2011, and 2012; an experimental run consists of the top 10,000
         * documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, numHits).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i + 1);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}