Example usage for org.apache.lucene.search.similarities BM25Similarity BM25Similarity

List of usage examples for org.apache.lucene.search.similarities BM25Similarity BM25Similarity

Introduction

In this page you can find the example usage for org.apache.lucene.search.similarities BM25Similarity BM25Similarity.

Prototype

public BM25Similarity(float k1, float b) 

Source Link

Document

BM25 with the supplied parameter values.

Usage

From source file:apps.LuceneIndexer.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;//  w w w.ja v a2 s  .c  om
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            //        System.out.println(inpDoc.mDocId);
            //        System.out.println(cleanText);
            //        System.out.println("==============================");

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}

From source file:apps.LuceneQuery.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("d", null, true, "index directory");
    options.addOption("i", null, true, "input file");
    options.addOption("s", null, true, "stop word file");
    options.addOption("n", null, true, "max # of results");
    options.addOption("o", null, true, "a TREC-style output file");
    options.addOption("r", null, true, "an optional QREL file, if specified,"
            + "we save results only for queries for which we find at least one relevant entry.");

    options.addOption("prob", null, true, "question sampling probability");
    options.addOption("max_query_qty", null, true, "a maximum number of queries to run");
    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    options.addOption("seed", null, true, "random seed");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "query source type: " + commaJoin.join(SourceFactory.getQuerySourceList()));

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    QrelReader qrels = null;/*ww w . ja  va  2 s .  co  m*/

    try {

        CommandLine cmd = parser.parse(options, args);

        String indexDir = null;

        if (cmd.hasOption("d")) {
            indexDir = cmd.getOptionValue("d");
        } else {
            Usage("Specify 'index directory'", options);
        }

        String inputFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        DictNoComments stopWords = null;

        if (cmd.hasOption("s")) {
            String stopWordFileName = cmd.getOptionValue("s");
            stopWords = new DictNoComments(new File(stopWordFileName), true /* lowercasing */);
            System.out.println("Using the stopword file: " + stopWordFileName);
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        int numRet = 100;

        if (cmd.hasOption("n")) {
            numRet = Integer.parseInt(cmd.getOptionValue("n"));
            System.out.println("Retrieving at most " + numRet + " candidate entries.");
        }

        String trecOutFileName = null;

        if (cmd.hasOption("o")) {
            trecOutFileName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'a TREC-style output file'", options);
        }

        double fProb = 1.0f;

        if (cmd.hasOption("prob")) {
            try {
                fProb = Double.parseDouble(cmd.getOptionValue("prob"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'question sampling probability'", options);
            }
        }

        if (fProb <= 0 || fProb > 1) {
            Usage("Question sampling probability should be >0 and <=1", options);
        }

        System.out.println("Sample the following fraction of questions: " + fProb);

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        long seed = 0;

        String tmpl = cmd.getOptionValue("seed");

        if (tmpl != null)
            seed = Long.parseLong(tmpl);

        System.out.println("Using seed: " + seed);

        Random randGen = new Random(seed);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        Similarity similarity = null;

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            similarity = new BM25SimilarityFix(bm25_k1, bm25_b);
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            similarity = new BM25Similarity(bm25_k1, bm25_b);
        }

        int maxQueryQty = Integer.MAX_VALUE;

        if (cmd.hasOption("max_query_qty")) {
            try {
                maxQueryQty = Integer.parseInt(cmd.getOptionValue("max_query_qty"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'max_query_qty'", options);
            }
        }

        System.out.println(String.format("Executing at most %d queries", maxQueryQty));

        if (cmd.hasOption("r")) {
            String qrelFile = cmd.getOptionValue("r");
            System.out.println("Using the qrel file: '" + qrelFile
                    + "', queries not returning a relevant entry will be ignored.");
            qrels = new QrelReader(qrelFile);
        }

        System.out.println(String.format("Using indexing directory %s", indexDir));

        LuceneCandidateProvider candProvider = new LuceneCandidateProvider(indexDir, analyzer, similarity);
        TextCleaner textCleaner = new TextCleaner(stopWords);

        QuerySource inpQuerySource = SourceFactory.createQuerySource(sourceName, inputFileName);
        QueryEntry inpQuery = null;

        BufferedWriter trecOutFile = new BufferedWriter(new FileWriter(new File(trecOutFileName)));

        int questNum = 0, questQty = 0;

        long totalTimeMS = 0;

        while ((inpQuery = inpQuerySource.next()) != null) {
            if (questQty >= maxQueryQty)
                break;
            ++questNum;

            String queryID = inpQuery.mQueryId;

            if (randGen.nextDouble() <= fProb) {
                ++questQty;

                String tokQuery = spaceJoin.join(textCleaner.cleanUp(inpQuery.mQueryText));
                String query = TextCleaner.luceneSafeCleanUp(tokQuery).trim();

                ResEntry[] results = null;

                if (query.isEmpty()) {
                    results = new ResEntry[0];
                    System.out.println(String.format("WARNING, empty query id = '%s'", inpQuery.mQueryId));
                } else {

                    try {
                        long start = System.currentTimeMillis();

                        results = candProvider.getCandidates(questNum, query, numRet);

                        long end = System.currentTimeMillis();
                        long searchTimeMS = end - start;
                        totalTimeMS += searchTimeMS;

                        System.out.println(String.format(
                                "Obtained results for the query # %d (answered %d queries), queryID %s the search took %d ms, we asked for max %d entries got %d",
                                questNum, questQty, queryID, searchTimeMS, numRet, results.length));

                    } catch (ParseException e) {
                        e.printStackTrace();
                        System.err.println(
                                "Error parsing query: " + query + " orig question is :" + inpQuery.mQueryText);
                        System.exit(1);
                    }
                }

                boolean bSave = true;

                if (qrels != null) {
                    boolean bOk = false;
                    for (ResEntry r : results) {
                        String label = qrels.get(queryID, r.mDocId);
                        if (candProvider.isRelevLabel(label, 1)) {
                            bOk = true;
                            break;
                        }
                    }
                    if (!bOk)
                        bSave = false;
                }

                //            System.out.println(String.format("Ranking results the query # %d queryId='%s' save results? %b", 
                //                                              questNum, queryID, bSave));          
                if (bSave) {
                    saveTrecResults(queryID, results, trecOutFile, TREC_RUN, numRet);
                }
            }

            if (questNum % 1000 == 0)
                System.out.println(String.format("Proccessed %d questions", questNum));

        }

        System.out.println(String.format("Proccessed %d questions, the search took %f MS on average", questQty,
                (float) totalTimeMS / questQty));

        trecOutFile.close();

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments: " + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }
}

From source file:io.anserini.qa.RetrieveSentences.java

License:Apache License

public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits)
        throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);

    //using BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);/* w  ww  .  j a  v a  2 s .  c  om*/

    EnglishAnalyzer ea = new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Map<String, Float> scoredDocs = new LinkedHashMap<>();

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);

        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}

From source file:io.anserini.search.SearchCollection.java

License:Apache License

public SearchCollection(SearchArgs args) throws IOException {
    this.args = args;
    Path indexPath = Paths.get(args.index);

    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(args.index + " does not exist or is not a directory.");
    }/*from  ww  w  . j ava2  s. c  o  m*/

    LOG.info("Reading index at " + args.index);
    this.reader = DirectoryReader.open(FSDirectory.open(indexPath));

    // Figure out which scoring model to use.
    if (args.ql) {
        LOG.info("Using QL scoring model");
        this.similarity = new LMDirichletSimilarity(args.mu);
    } else if (args.bm25) {
        LOG.info("Using BM25 scoring model");
        this.similarity = new BM25Similarity(args.k1, args.b);
    } else if (args.f2log) {
        LOG.info("Using F2Log scoring model");
        this.similarity = new F2LogSimilarity(args.f2log_s);
    } else {
        throw new IllegalArgumentException("Error: Must specify scoring model!");
    }

    // Are we searching tweets?
    if (args.searchtweets) {
        analyzer = new TweetAnalyzer();
    } else {
        analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    }

    isRerank = args.rm3 || args.axiom;

    // Set up the ranking cascade.
    cascade = new RerankerCascade();
    if (args.rm3) {
        cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args));
    } else if (args.axiom) {
        cascade.add(new AxiomReranker(FIELD_BODY, args));
    }

    cascade.add(new ScoreTiesAdjusterReranker());
}

From source file:io.anserini.search.SearchTweets.java

License:Apache License

public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));

    try {// www  .  j a v a2s. c  o m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }

    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }

    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name,
                "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }

    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));

    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);

    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();

        Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(),
                true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER,
                topic.getQuery());

        TopDocs rs = searcher.search(query, filter, searchArgs.hits);

        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(),
                Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid,
                    docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i],
                    searchArgs.runtag));
        }
        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }

    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");

    reader.close();
    out.close();
}

From source file:io.anserini.search.SearchWebCollection.java

License:Apache License

public static void main(String[] args) throws IOException, ParseException {

    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));

    try {//from ww  w . j  a  va  2 s  .c o  m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }

    Similarity similarity = null;

    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        similarity = new LMDirichletSimilarity(searchArgs.mu);
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }

    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(new EnglishAnalyzer(), "body",
                "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
    } else {
        cascade.add(new IdentityReranker());
    }

    Path topicsFile = Paths.get(searchArgs.topics);

    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    SortedMap<Integer, String> topics = io.anserini.document.Collection.GOV2.equals(searchArgs.collection)
            ? readTeraByteTackQueries(topicsFile)
            : readWebTrackQueries(topicsFile);

    SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
    searcher.search(topics, searchArgs.output, similarity, searchArgs.hits);
    searcher.close();
}

From source file:io.anserini.util.SearchTimeUtil.java

License:Apache License

public static void main(String[] args) throws IOException, ParseException {

    if (args.length != 1) {
        System.err.println("Usage: SearchTimeUtil <indexDir>");
        System.err.println("indexDir: index directory");
        System.exit(1);/*from  w  w w. j a v  a 2s . c o  m*/
    }

    String[] topics = { "topics.web.1-50.txt", "topics.web.51-100.txt", "topics.web.101-150.txt",
            "topics.web.151-200.txt", "topics.web.201-250.txt", "topics.web.251-300.txt" };

    SearchWebCollection searcher = new SearchWebCollection(args[0]);

    for (String topicFile : topics) {
        Path topicsFile = Paths.get("src/resources/topics-and-qrels/", topicFile);
        SortedMap<Integer, String> queries = SearchWebCollection.readWebTrackQueries(topicsFile);
        for (int i = 1; i <= 3; i++) {
            final long start = System.nanoTime();
            String submissionFile = File.createTempFile(topicFile + "_" + i, ".tmp").getAbsolutePath();
            searcher.search(queries, submissionFile, new BM25Similarity(0.9f, 0.4f), 1000);
            final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start,
                    TimeUnit.NANOSECONDS);
            System.out.println(topicFile + "_" + i + " search completed in "
                    + DurationFormatUtils.formatDuration(durationMillis, "mm:ss:SSS"));
        }
    }

    searcher.close();
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java

License:Apache License

public static void main(String[] args) {

    Properties properties = new Properties();
    InputStream input = null;//www .  j  av a  2 s .c  o m
    try {
        if (System.getProperty("properties.path") != null) {
            input = new FileInputStream(System.getProperty("properties.path"));
            properties.load(input);
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            input = loader.getResourceAsStream("lucene-clef.properties");
            properties.load(input);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    properties.putAll(System.getProperties());

    Path indexPath = new File(properties.getProperty("index.path")).toPath();

    Path runPath = new File(properties.getProperty("run.path")).toPath();

    String runTag = properties.getProperty("run.tag");

    String language = properties.getProperty("language");

    String stemmer = properties.getProperty("stemmer");

    String stopsetType = properties.getProperty("stopset.type");

    String stopsetPath = properties.getProperty("stopset.path");

    try {

        Directory directory = new SimpleFSDirectory(indexPath);
        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);

        String model = properties.getProperty("run.model").toUpperCase();

        Similarity similarity;

        switch (model) {
        case "BM25":
            similarity = new BM25Similarity(Float.parseFloat(properties.getProperty("bm25.k1", "1.2f")),
                    Float.parseFloat(properties.getProperty("bm25.b", "0.75f")));
            break;
        default:
            throw new UnsupportedOperationException("Model " + model + " not supported yet");

        }

        searcher.setSimilarity(similarity);

        int maxResults = Integer.parseInt(properties.getProperty("maxresults", "1000"));

        SubmissionReport runfile = new SubmissionReport(
                new PrintWriter(Files.newBufferedWriter(runPath, StandardCharsets.UTF_8)), model);

        String topicPath = properties.getProperty("topics.path");

        String[] topicFields = properties.getProperty("topics.fields").split(";");

        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

        QualityQuery qqs[] = getQualityQueries(topicPath, topicFields);

        QualityQueryParser qqParser = new ClefQQParser(topicFields, BuildIndex.BODY_FIELD_NAME, language,
                stemmer, stopset);

        // run the benchmark
        QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, BuildIndex.ID_FIELD_NAME);

        qrun.setMaxResults(maxResults);

        QualityStats stats[] = qrun.execute(null, runfile, null);

        reader.close();
        directory.close();

    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:NewsIR_search.NewsIRSearcher.java

private void setSimilarityFn_ResFileName(int indexcounter) throws IOException {

    searcher = new IndexSearcher(reader);
    float bm25_k1, bm25_b, lm_jm_lambda, lm_d_mu;

    switch (setSimilarityFlag.toLowerCase()) {
    case "bm25":
        bm25_k1 = Float.parseFloat(prop.getProperty("bm25-k1"));
        bm25_b = Float.parseFloat(prop.getProperty("bm25-b"));

        System.out.println("Setting BM25 with k1: " + bm25_k1 + " b: " + bm25_b);
        searcher.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));

        run_name = "bm25-k1=" + bm25_k1 + "-b=" + bm25_b + "-" + num_ret;
        break;//from   w w w .  j av  a 2 s  .c  om
    case "lm-jm":
        lm_jm_lambda = Float.parseFloat(prop.getProperty("lm_jm_lambda"));

        System.out.println("Setting LMJelinekMercer with lambda: " + lm_jm_lambda);
        searcher.setSimilarity(new LMJelinekMercerSimilarity(lm_jm_lambda));

        run_name = "lm-jm-lambda" + lm_jm_lambda;
        break;
    case "lm-d":
        lm_d_mu = Float.parseFloat(prop.getProperty("lm_d_mu"));

        System.out.println("Setting LMDirichlet with mu: " + lm_d_mu);
        searcher.setSimilarity(new LMDirichletSimilarity(lm_d_mu));

        run_name = "lm-d-mu=" + lm_d_mu + "-";
        break;

    case "default":
        System.out.println("Setting DefaultSimilarity of Lucene: ");
        searcher.setSimilarity(new DefaultSimilarity());
        run_name = "default-lucene";
        break;

    default:
        System.out.println("Setting DefaultSimilarity of Lucene: ");
        searcher.setSimilarity(new DefaultSimilarity());
        run_name = "default-lucene";
        break;
    }

    resultsFile = run_name;

}

From source file:nl.uva.DataHandler.Indexing.java

@Override
public void Indexer(String indexPathString) throws Exception, Throwable {
    try {//from w w w  .j  av a2s .  com
        log.info(
                "----------------------- INDEXING - Override Version: for BM25 :)  --------------------------");

        Path ipath = FileSystems.getDefault().getPath(indexPathString);
        super.IndexesCleaner(indexPathString);
        MyAnalyzer myAnalyzer;
        if (!super.stopWordsRemoving)
            myAnalyzer = new MyAnalyzer(super.stemming);
        else
            myAnalyzer = new MyAnalyzer(super.stemming, super.LoadStopwords());

        Analyzer analyzer = myAnalyzer.getAnalyzer(configFile.getProperty("CORPUS_LANGUAGE"));
        PerFieldAnalyzerWrapper prfWrapper = new PerFieldAnalyzerWrapper(analyzer, super.analyzerMap);
        IndexWriterConfig irc = new IndexWriterConfig(prfWrapper);
        irc.setSimilarity(new BM25Similarity(1.2F, 0.75F));
        this.writer = new IndexWriter(new SimpleFSDirectory(ipath), irc);
        this.docIndexer();
        this.writer.commit();
        this.writer.close();
        analyzer.close();
        prfWrapper.close();
        log.info("-------------------------------------------------");
        log.info("Index is created successfully...");
        log.info("-------------------------------------------------");

    } catch (Exception ex) {
        log.error(ex);
        throw ex;
    }
}