List of usage examples for org.apache.lucene.search IndexSearcher setSimilarity
public void setSimilarity(Similarity similarity)
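setSimilarity replaces the searcher's default scoring model and must be called before the queries whose scores it should affect. Before the project examples below, here is a minimal, self-contained sketch of the pattern, assuming a Lucene 5+ style API; the index path, the field name "text", and the query term are illustrative placeholders:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index ("path/to/index" is a placeholder).
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("path/to/index")));
        IndexSearcher searcher = new IndexSearcher(reader);

        // Install the scoring model before searching; all subsequent searches
        // on this searcher are scored with BM25 (here k1 = 0.9, b = 0.4).
        searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));

        TopDocs hits = searcher.search(new TermQuery(new Term("text", "lucene")), 10);
        System.out.println(hits.totalHits + " matching documents");
        reader.close();
    }
}

Any Similarity subclass can be installed the same way, for example LMDirichletSimilarity for query-likelihood scoring, as the SearchTweets example below shows.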
From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java
License:Open Source License
/**
 * Implementation of the annotate method by IndexBasedAnnotator.
 *
 * The input text is split into fragments according to punctuation;
 * every fragment is used as a query and sent to a Lucene SE that
 * was used to index the terminology (BM25 weight).
 * Up to the 20 top results returned by the system are taken as the annotation for the
 * fragment text. All the fragment annotations combined compose the document annotation
 * that is returned by this method.
 */
public DocumentAnnotation annotate(String document) {
    DocumentAnnotation ret = new DocumentAnnotation();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(termIndexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());
        /*
        document = document.replaceAll("\\[.*?\\]", "").trim();
        //document = document.replaceAll("\\p{Punct}", " ");
        String[] fragments = document.split("[;:\\.,]");
        */
        String[] fragments = getSentences(document).toArray(new String[0]);
        for (String ofragment : fragments) {
            ofragment = ofragment.replaceAll("\\p{Punct}", " ");
            ofragment = ofragment.trim();
            String sa[] = ofragment.split("(?<=[ \\n])");
            EnglishStemmer st = new EnglishStemmer();
            StringBuffer fbuf = new StringBuffer();
            for (String s : sa) {
                st.setCurrent(s.trim());
                st.stem();
                fbuf.append(st.getCurrent());
                fbuf.append(" ");
            }
            String fragment = fbuf.toString().trim(); // stemmed fragment
            if (fragment.length() == 0)
                continue;
            //System.err.println("Annotating: " + fragment);
            QueryParser parser = new QueryParser(Version.LUCENE_44, "labels", Yasemir.analyzer);
            Query query = parser.parse(fragment);
            String stemmedFragment = query.toString("labels").replaceAll("labels:", "");

            TopDocs results = searcher.search(query, 20);
            ScoreDoc[] hits = results.scoreDocs;
            int numTotalHits = results.totalHits;
            //System.err.println(numTotalHits + " total matching classes");
            if (numTotalHits > 0) {
                hits = searcher.search(query, numTotalHits).scoreDocs;
                for (int i = 0; i < Math.min(numTotalHits, MAX_ANNOTS); i++) {
                    Document doc = searcher.doc(hits[i].doc);
                    String ptrn = "(?i)(" + doc.get("labels").replaceAll(", ", "|") + ")";
                    //System.err.println("OWLClass=" + doc.get("id") + " score=" + hits[i].score);
                    if (Tools.checkPattern(stemmedFragment, ptrn)) {
                        //System.err.println("OK: OWLClass=" + doc.get("id") + " score=" + hits[i].score);
                        Annotation ann = new Annotation(doc.get("id"));
                        String ontoID = ann.getRelatedOntology().getOntologyID();
                        Vector<Annotation> annotations = ret.get(ontoID);
                        if (annotations == null)
                            annotations = new Vector<Annotation>();
                        annotations.add(ann);
                        ret.put(ontoID, annotations);
                    }
                }
            }
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:framework.retrieval.engine.query.RQuery.java
License:Apache License
/**
 * Creates a query object over one or more index readers.
 *
 * @param analyzerFactory        factory that supplies the analyzer and similarity
 * @param highlighterFactory     factory that supplies the result highlighter
 * @param queryResultTopDocsNum  maximum number of top documents to retrieve
 * @param baseIndexPath          base path of the index
 * @param indexReaderProxys      proxies for the underlying index readers
 */
public RQuery(IRAnalyzerFactory analyzerFactory, IHighlighterFactory highlighterFactory,
        int queryResultTopDocsNum, String baseIndexPath, IndexReaderProxy[] indexReaderProxys) {
    this.analyzerFactory = analyzerFactory;
    this.highlighterFactory = highlighterFactory;
    this.queryResultTopDocsNum = queryResultTopDocsNum;
    this.indexReaderProxys = indexReaderProxys;

    int length = indexReaderProxys.length;
    if (length > 1) {
        IndexSearcher[] searchers = new IndexSearcher[length];
        for (int i = 0; i < length; i++) {
            IndexSearcher searcher = null;
            try {
                searcher = new IndexSearcher(indexReaderProxys[i].getIndexReader());
            } catch (Exception e) {
                throw new RetrievalQueryException(e);
            }
            searchers[i] = searcher;
        }
        try {
            searcher = new MultiSearcher(searchers);
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    } else {
        try {
            searcher = new IndexSearcher(indexReaderProxys[0].getIndexReader());
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    }

    searcher.setSimilarity(analyzerFactory.createSimilarity());
}
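A note on the example above: MultiSearcher is a pre-4.0 Lucene API that was removed in Lucene 4.0. On modern Lucene, a sketch of the equivalent pattern (the index paths here are placeholders) merges the readers with a MultiReader and sets the similarity once on a single searcher:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class MultiIndexSearchExample {
    public static void main(String[] args) throws Exception {
        // Open one reader per index ("index1"/"index2" are placeholder paths),
        // merge them with a MultiReader, and install the similarity once.
        IndexReader r1 = DirectoryReader.open(FSDirectory.open(Paths.get("index1")));
        IndexReader r2 = DirectoryReader.open(FSDirectory.open(Paths.get("index2")));
        IndexSearcher searcher = new IndexSearcher(new MultiReader(r1, r2));
        searcher.setSimilarity(new BM25Similarity());
    }
}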
From source file:HW1.SearchFiles.java
License:Apache License
public static void main(String[] args) throws Exception {
    String queryString = "dislike football";
    String indexPath = "/Users/yangyang/Desktop/lucene/hw1/index/index04";
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer();
    searcher.setSimilarity(new BM25Similarity());
    QueryParser parser = new QueryParser("TEXT", analyzer);
    Query query = parser.parse(queryString);
    System.out.println("Searching for: " + query.toString("TEXT"));

    TopDocs results = searcher.search(query, 10);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    for (int i = 0; i < hits.length; i++) {
        Document doc = searcher.doc(hits[i].doc);
        System.out.println("DOCNO: " + doc.get("DOCNO"));
    }
    reader.close();
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);

        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term));     // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder();   // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
From source file:io.anserini.qa.RetrieveSentences.java
License:Apache License
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits)
        throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Using the BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);

    EnglishAnalyzer ea = new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);

        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * If the external reranking context is not null, we will first search against the external
 * index and return the top-ranked documents.
 *
 * @param docs The initial ranking results against the target index. We will return them if the
 *             external index is null.
 *
 * @return Top-ranked ScoredDocuments from searching the external index
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context)
        throws IOException {
    if (externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(context.getIndexSearcher().getSimilarity(true));

        SearchArgs args = new SearchArgs();
        args.hits = this.R;
        args.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
        args.searchtweets = context.getSearchArgs().searchtweets;

        RerankerContext<T> externalContext = new RerankerContext<>(searcher, context.getQueryId(),
                context.getQuery(), context.getQueryText(), context.getQueryTokens(),
                context.getFilter(), args);
        return searchTopDocs(null, externalContext);
    } else {
        return docs;
    }
}
From source file:io.anserini.search.SearchClueWeb09b.java
License:Apache License
/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator Default search operator: AND or OR
 * @throws IOException
 * @throws ParseException
 */
public void search(String topicsFile, String submissionFile, QueryParser.Operator operator)
        throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);
    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010, 2011, and 2012; an experimental run consists of the top 10,000
         * documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}
From source file:io.anserini.search.SearchCollection.java
License:Apache License
@SuppressWarnings("unchecked") public <K> int runTopics() throws IOException { IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(similarity); Path topicsFile = Paths.get(args.topics); if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) { throw new IllegalArgumentException( "Topics file : " + topicsFile + " does not exist or is not a (readable) file."); }/*www . j a v a 2s . c o m*/ TopicReader<K> tr; SortedMap<K, Map<String, String>> topics; try { tr = (TopicReader<K>) Class.forName("io.anserini.search.query." + args.topicReader + "TopicReader") .getConstructor(Path.class).newInstance(topicsFile); topics = tr.read(); } catch (Exception e) { throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); } final String runTag = "Anserini_" + args.topicfield + "_" + (args.keepstop ? "KeepStopwords_" : "") + FIELD_BODY + "_" + (args.searchtweets ? "SearchTweets_" : "") + similarity.toString(); PrintWriter out = new PrintWriter( Files.newBufferedWriter(Paths.get(args.output), StandardCharsets.US_ASCII)); for (Map.Entry<K, Map<String, String>> entry : topics.entrySet()) { K qid = entry.getKey(); String queryString = entry.getValue().get(args.topicfield); ScoredDocuments docs; if (args.searchtweets) { docs = searchTweets(searcher, qid, queryString, Long.parseLong(entry.getValue().get("time"))); } else { docs = search(searcher, qid, queryString); } /** * the first column is the topic number. * the second column is currently unused and should always be "Q0". * the third column is the official document identifier of the retrieved document. * the fourth column is the rank the document is retrieved. * the fifth column shows the score (integer or floating point) that generated the ranking. * the sixth column is called the "run tag" and should be a unique identifier for your */ for (int i = 0; i < docs.documents.length; i++) { out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], ((i == 0 || i == docs.documents.length - 1) ? runTag : "See_Line1"))); } } out.flush(); out.close(); return topics.size(); }
From source file:io.anserini.search.SearchTweets.java
License:Apache License
public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }

    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }

    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name,
                "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));

    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);

    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();

        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L,
                topic.getQueryTweetTime(), true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, searchArgs.hits);

        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(),
                Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1),
                    docs.scores[i], searchArgs.runtag));
        }

        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }

    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");

    reader.close();
    out.close();
}
From source file:io.anserini.search.SearchWebCollection.java
License:Apache License
/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topics queries
 * @param similarity similarity
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity,
        int numHits) throws IOException, ParseException {

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    final String runTag = "BM25_EnglishAnalyzer_" + FIELD_BODY + "_" + similarity.toString();

    PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

    QueryParser queryParser = new QueryParser(FIELD_BODY, new EnglishAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010, 2011, and 2012; an experimental run consists of the top 10,000
         * documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, numHits).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i + 1);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }

    out.flush();
    out.close();
}