Example usage for org.apache.lucene.search IndexSearcher setSimilarity

List of usage examples for org.apache.lucene.search IndexSearcher setSimilarity

Introduction

In this page you can find the example usage for org.apache.lucene.search IndexSearcher setSimilarity.

Prototype

public void setSimilarity(Similarity similarity) 

Source Link

Document

Expert: Set the Similarity implementation used by this IndexSearcher.

Usage

From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java

License:Open Source License

/**
 * Implementation of the annotate method by IndexBasedAnnotator.
 *
 * The input text is split into fragments according to punctuation;
 * every fragment is used as a query and sent to a Lucene SE that
 * was used to index the terminology (BM25 weight).
 * Up to the 20 top results returned by the system are taken as the annotation for the
 * fragment text. All the fragment annotations combined compose the document annotation
 * that is returned by this method.
 */
public DocumentAnnotation annotate(String document) {
    DocumentAnnotation ret = new DocumentAnnotation();

    try {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(termIndexPath)));
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Rank terminology labels with BM25 rather than the default similarity.
            searcher.setSimilarity(new BM25Similarity());

            // BUG FIX: Collection.toArray() with no argument returns Object[], so the
            // old "(String[]) getSentences(document).toArray()" cast threw
            // ClassCastException at runtime. toArray(new String[0]) allocates a real
            // String[] (outer cast kept in case getSentences() returns a raw type).
            String[] fragments = (String[]) getSentences(document).toArray(new String[0]);

            for (String ofragment : fragments) {
                // Strip punctuation, then stem every whitespace-separated token.
                ofragment = ofragment.replaceAll("\\p{Punct}", " ");
                ofragment = ofragment.trim();
                String sa[] = ofragment.split("(?<=[ \\n])");
                EnglishStemmer st = new EnglishStemmer();
                StringBuffer fbuf = new StringBuffer();
                for (String s : sa) {
                    st.setCurrent(s.trim());
                    st.stem();
                    fbuf.append(st.getCurrent());
                    fbuf.append(" ");
                }

                String fragment = fbuf.toString().trim(); // stemmed fragment

                if (fragment.length() == 0)
                    continue;

                // Use the stemmed fragment as a query against the terminology index.
                QueryParser parser = new QueryParser(Version.LUCENE_44, "labels", Yasemir.analyzer);
                Query query = parser.parse(fragment);
                String stemmedFragment = query.toString("labels").replaceAll("labels:", "");

                TopDocs results = searcher.search(query, 20);
                ScoreDoc[] hits = results.scoreDocs;

                int numTotalHits = results.totalHits;

                if (numTotalHits > 0) {
                    hits = searcher.search(query, numTotalHits).scoreDocs;
                    for (int i = 0; i < Math.min(numTotalHits, MAX_ANNOTS); i++) {
                        Document doc = searcher.doc(hits[i].doc);
                        // Accept a hit only if one of its labels actually occurs
                        // in the stemmed fragment text.
                        String ptrn = "(?i)(" + doc.get("labels").replaceAll(", ", "|") + ")";
                        if (Tools.checkPattern(stemmedFragment, ptrn)) {
                            Annotation ann = new Annotation(doc.get("id"));
                            String ontoID = ann.getRelatedOntology().getOntologyID();

                            // Group annotations by the ontology they belong to.
                            Vector<Annotation> annotations = ret.get(ontoID);
                            if (annotations == null)
                                annotations = new Vector<Annotation>();
                            annotations.add(ann);
                            ret.put(ontoID, annotations);
                        }
                    }
                }
            }
        } finally {
            // ROBUSTNESS: the original closed the reader only on the success path,
            // leaking it whenever parsing/searching threw.
            reader.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:framework.retrieval.engine.query.RQuery.java

License:Apache License

/**
 * Creates a query facade over one or more index readers; when several
 * readers are supplied they are combined through a MultiSearcher.
 *
 * @param analyzerFactory
 *            factory used to create analyzers and the Similarity
 * @param highlighterFactory
 *            factory used to create result highlighters
 * @param queryResultTopDocsNum
 *            maximum number of top documents to return per query
 * @param baseIndexPath
 *            base path of the index directories
 * @param indexReaderProxys
 *            proxies providing the underlying IndexReader instances
 */
public RQuery(IRAnalyzerFactory analyzerFactory, IHighlighterFactory highlighterFactory,
        int queryResultTopDocsNum, String baseIndexPath, IndexReaderProxy[] indexReaderProxys) {
    // NOTE(review): baseIndexPath is accepted but never stored or read here —
    // confirm whether it is still needed by this constructor.
    this.analyzerFactory = analyzerFactory;
    this.highlighterFactory = highlighterFactory;
    this.queryResultTopDocsNum = queryResultTopDocsNum;
    this.indexReaderProxys = indexReaderProxys;

    int length = indexReaderProxys.length;
    if (length > 1) {
        // Multiple readers: wrap one IndexSearcher per reader in a MultiSearcher.
        IndexSearcher[] searchers = new IndexSearcher[length];
        for (int i = 0; i < length; i++) {
            IndexSearcher searcher = null;
            try {
                searcher = new IndexSearcher(indexReaderProxys[i].getIndexReader());
            } catch (Exception e) {
                throw new RetrievalQueryException(e);
            }
            searchers[i] = searcher;
        }
        try {
            searcher = new MultiSearcher(searchers);
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    } else {
        // Single reader: search it directly.
        try {
            searcher = new IndexSearcher(indexReaderProxys[0].getIndexReader());
        } catch (Exception e) {
            throw new RetrievalQueryException(e);
        }
    }
    // NOTE(review): the similarity is set only on the top-level searcher; the
    // per-reader sub-searchers above keep the default. Verify that MultiSearcher
    // propagates its Similarity to sub-searchers during scoring.
    searcher.setSimilarity(analyzerFactory.createSimilarity());
}

From source file:HW1.SearchFiles.java

License:Apache License

public static void main(String[] args) throws Exception {
    // Fixed demo query and index location for this homework run.
    final String searchText = "dislike football";
    final String indexDir = "/Users/yangyang/Desktop/lucene/hw1/index/index04";

    // Open the on-disk index and score hits with BM25.
    IndexReader idxReader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)));
    IndexSearcher idxSearcher = new IndexSearcher(idxReader);
    idxSearcher.setSimilarity(new BM25Similarity());

    // Parse the query against the TEXT field with the standard analyzer.
    Analyzer stdAnalyzer = new StandardAnalyzer();
    Query parsed = new QueryParser("TEXT", stdAnalyzer).parse(searchText);
    System.out.println("Searching for: " + parsed.toString("TEXT"));

    // Retrieve the top 10 documents and report the total hit count.
    TopDocs top = idxSearcher.search(parsed, 10);
    System.out.println(top.totalHits + " total matching documents");

    // Print the stored DOCNO field of each hit.
    for (ScoreDoc hit : top.scoreDocs) {
        Document doc = idxSearcher.doc(hit.doc);
        System.out.println("DOCNO: " + doc.get("DOCNO"));
    }

    idxReader.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    // Open the index built by the test fixture and score with BM25.
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term)); // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder(); // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}

From source file:io.anserini.qa.RetrieveSentences.java

License:Apache License

/**
 * Runs each topic as a BM25-scored bag-of-words query against the index and
 * collects the retrieval score of every returned document.
 *
 * @param topics  topic id → query string, processed in key order
 * @param numHits maximum number of hits retrieved per topic
 * @return document id → retrieval score, in retrieval order (later topics can
 *         overwrite the score of a document already seen)
 * @throws IOException    on index access failure
 * @throws ParseException declared for API compatibility
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits)
        throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Use BM25 scoring (k1 = 0.9, b = 0.4).
    searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));

    // CLEANUP: the original also built a fully-configured QueryParser and read
    // the topic id into a local — both were never used; queries are built
    // directly from the analyzed topic text below.
    EnglishAnalyzer ea = new EnglishAnalyzer();
    Map<String, Float> scoredDocs = new LinkedHashMap<>();

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        String queryString = entry.getValue();
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

        TopDocs rs = searcher.search(query, numHits);
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);

        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * If the external reranking context is not null we will first search against the external
 * index and return the top ranked documents.
 *
 * @param docs The initial ranking results against target index. We will return them if external
 *             index is null./*from w  ww.ja v  a 2  s  .  c  o  m*/
 *
 * @return Top ranked ScoredDocuments from searching external index
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context)
        throws IOException {
    if (externalIndexPath != null) {
        // Validate the external index location before opening it.
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        // NOTE(review): this reader is never closed; it is still referenced by the
        // searcher handed to searchTopDocs, so confirm where its lifetime should end
        // before adding a close().
        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
        IndexSearcher searcher = new IndexSearcher(reader);
        // Mirror the similarity of the target-index searcher on the external index.
        searcher.setSimilarity(context.getIndexSearcher().getSimilarity(true));

        // Re-run the search against the external index with this reranker's
        // hit budget (R), preserving the original tie-break/tweet settings.
        SearchArgs args = new SearchArgs();
        args.hits = this.R;
        args.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
        args.searchtweets = context.getSearchArgs().searchtweets;

        RerankerContext<T> externalContext = new RerankerContext<>(searcher, context.getQueryId(),
                context.getQuery(), context.getQueryText(), context.getQueryTokens(), context.getFilter(),
                args);

        return searchTopDocs(null, externalContext);
    } else {
        // No external index configured: pass the initial ranking through unchanged.
        return docs;
    }
}

From source file:io.anserini.search.SearchClueWeb09b.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator   Default search operator: AND or OR
 * @throws IOException
 * @throws ParseException
 */

public void search(String topicsFile, String submissionFile, QueryParser.Operator operator)
        throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    // try-with-resources: the original leaked the writer if parse() threw.
    try (PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII))) {

        QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
        queryParser.setDefaultOperator(operator);

        SortedMap<Integer, String> topics = readQueries(topicsPath);

        for (Map.Entry<Integer, String> entry : topics.entrySet()) {

            int qID = entry.getKey();
            String queryString = entry.getValue();
            Query query = queryParser.parse(queryString);

            ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

            /**
             * TREC submission format, one line per hit:
             * topic number, "Q0", document identifier, rank, score, run tag.
             */
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                out.print(qID);
                out.print("\tQ0\t");
                out.print(doc.get(FIELD_ID));
                out.print("\t");
                // BUG FIX: the rank column is 1-based in the TREC format (and in the
                // sibling SearchWebCollection.search); this previously printed the
                // 0-based loop index.
                out.print(i + 1);
                out.print("\t");
                out.print(hits[i].score);
                out.print("\t");
                out.print(runTag);
                out.println();
            }
        }
        out.flush();
    }
}

From source file:io.anserini.search.SearchCollection.java

License:Apache License

/**
 * Runs every topic from {@code args.topics} against the index and writes a
 * TREC-format run file to {@code args.output}.
 *
 * @param <K> topic-id type produced by the configured TopicReader
 * @return number of topics processed
 * @throws IOException on index or file access failure
 * @throws IllegalArgumentException if the topics file is unreadable or the
 *         topic reader class cannot be loaded
 */
@SuppressWarnings("unchecked")
public <K> int runTopics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    Path topicsFile = Paths.get(args.topics);

    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    // Instantiate the topic reader named by the CLI arguments via reflection.
    TopicReader<K> tr;
    SortedMap<K, Map<String, String>> topics;
    try {
        tr = (TopicReader<K>) Class.forName("io.anserini.search.query." + args.topicReader + "TopicReader")
                .getConstructor(Path.class).newInstance(topicsFile);
        topics = tr.read();
    } catch (Exception e) {
        // BUG FIX: preserve the root cause instead of discarding it.
        throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader, e);
    }

    final String runTag = "Anserini_" + args.topicfield + "_" + (args.keepstop ? "KeepStopwords_" : "")
            + FIELD_BODY + "_" + (args.searchtweets ? "SearchTweets_" : "") + similarity.toString();

    // try-with-resources: the original leaked the writer if any search threw.
    try (PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(args.output), StandardCharsets.US_ASCII))) {

        for (Map.Entry<K, Map<String, String>> entry : topics.entrySet()) {
            K qid = entry.getKey();
            String queryString = entry.getValue().get(args.topicfield);

            ScoredDocuments docs;
            if (args.searchtweets) {
                docs = searchTweets(searcher, qid, queryString, Long.parseLong(entry.getValue().get("time")));
            } else {
                docs = search(searcher, qid, queryString);
            }

            /**
             * TREC submission format, one line per hit:
             * topic number, "Q0", document identifier, rank, score, run tag.
             * The run tag is emitted only on the first and last line per topic.
             */
            for (int i = 0; i < docs.documents.length; i++) {
                out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid,
                        docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i],
                        ((i == 0 || i == docs.documents.length - 1) ? runTag : "See_Line1")));
            }
        }
        out.flush();
    }

    return topics.size();
}

From source file:io.anserini.search.SearchTweets.java

License:Apache License

public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));

    // Parse CLI options; print usage and exit on error.
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    // Open the index, optionally memory-mapped with preloading.
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }

    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    // Exactly one scoring model must be selected on the command line.
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }

    // Reranker pipeline: optional RM3 expansion, always retweet removal.
    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name,
                "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));

    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);

    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();

        // Restrict hits to tweets posted no later than the query time.
        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(),
                true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, searchArgs.hits);

        // Rerank the initial retrieval through the cascade built above.
        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(),
                Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

        // Emit TREC-format lines; topic ids are numeric (strip the "MB0*" prefix).
        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i],
                    searchArgs.runtag));
        }
        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }

    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");

    reader.close();
    out.close();
}

From source file:io.anserini.search.SearchWebCollection.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topics     queries
 * @param similarity similarity
 * @throws IOException
 * @throws ParseException
 */

public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits)
        throws IOException, ParseException {

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    final String runTag = "BM25_EnglishAnalyzer_" + FIELD_BODY + "_" + similarity.toString();

    // try-with-resources: the original leaked the writer if parse() threw.
    try (PrintWriter out = new PrintWriter(
            Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII))) {

        QueryParser queryParser = new QueryParser(FIELD_BODY, new EnglishAnalyzer());
        queryParser.setDefaultOperator(QueryParser.Operator.OR);

        for (Map.Entry<Integer, String> entry : topics.entrySet()) {

            int qID = entry.getKey();
            String queryString = entry.getValue();
            Query query = queryParser.parse(queryString);

            ScoreDoc[] hits = searcher.search(query, numHits).scoreDocs;

            /**
             * TREC submission format, one line per hit:
             * topic number, "Q0", document identifier, rank (1-based), score, run tag.
             */
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = searcher.doc(docId);
                out.print(qID);
                out.print("\tQ0\t");
                out.print(doc.get(FIELD_ID));
                out.print("\t");
                out.print(i + 1);
                out.print("\t");
                out.print(hits[i].score);
                out.print("\t");
                out.print(runTag);
                out.println();
            }
        }
        out.flush();
    }
}