it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java Source code

Java tutorial

Introduction

Here is the source code for it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java

Source

package it.unipd.dei.ims.lucene.clef.applications;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import it.unipd.dei.ims.lucene.clef.AnalyzerFactory;
import it.unipd.dei.ims.lucene.clef.parser.ClefQQParser;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.benchmark.quality.QualityBenchmark;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.benchmark.quality.QualityStats;
import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 *
 *  Functionalities to perform the evaluation of ad-hoc test collections.
 *
 */
public class BatchRetrieval {

    static Logger logger = LoggerFactory.getLogger(BatchRetrieval.class);

    public static void main(String[] args) {

        Properties properties = new Properties();
        InputStream input = null;
        try {
            if (System.getProperty("properties.path") != null) {
                input = new FileInputStream(System.getProperty("properties.path"));
                properties.load(input);
            } else {
                logger.info("Loading default property file [resources/lucene-clef.properties]");
                ClassLoader loader = Thread.currentThread().getContextClassLoader();
                input = loader.getResourceAsStream("lucene-clef.properties");
                properties.load(input);
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        properties.putAll(System.getProperties());

        Path indexPath = new File(properties.getProperty("index.path")).toPath();

        Path runPath = new File(properties.getProperty("run.path")).toPath();

        String runTag = properties.getProperty("run.tag");

        String language = properties.getProperty("language");

        String stemmer = properties.getProperty("stemmer");

        String stopsetType = properties.getProperty("stopset.type");

        String stopsetPath = properties.getProperty("stopset.path");

        try {

            Directory directory = new SimpleFSDirectory(indexPath);
            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);

            String model = properties.getProperty("run.model").toUpperCase();

            Similarity similarity;

            switch (model) {
            case "BM25":
                similarity = new BM25Similarity(Float.parseFloat(properties.getProperty("bm25.k1", "1.2f")),
                        Float.parseFloat(properties.getProperty("bm25.b", "0.75f")));
                break;
            default:
                throw new UnsupportedOperationException("Model " + model + " not supported yet");

            }

            searcher.setSimilarity(similarity);

            int maxResults = Integer.parseInt(properties.getProperty("maxresults", "1000"));

            SubmissionReport runfile = new SubmissionReport(
                    new PrintWriter(Files.newBufferedWriter(runPath, StandardCharsets.UTF_8)), model);

            String topicPath = properties.getProperty("topics.path");

            String[] topicFields = properties.getProperty("topics.fields").split(";");

            CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

            QualityQuery qqs[] = getQualityQueries(topicPath, topicFields);

            QualityQueryParser qqParser = new ClefQQParser(topicFields, BuildIndex.BODY_FIELD_NAME, language,
                    stemmer, stopset);

            // run the benchmark
            QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, BuildIndex.ID_FIELD_NAME);

            qrun.setMaxResults(maxResults);

            QualityStats stats[] = qrun.execute(null, runfile, null);

            reader.close();
            directory.close();

        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }

    }

    public static QualityQuery[] getQualityQueries(String topicPath, String[] topicFields) throws Exception {

        File fXmlFile = new File(topicPath);
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = null;
        try {
            dBuilder = dbFactory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
            throw new Exception("ParserConfigurationException when parsing topic file " + topicPath);
        }
        org.w3c.dom.Document doc = null;
        try {
            doc = dBuilder.parse(fXmlFile);
        } catch (SAXException e) {
            e.printStackTrace();
            throw new Exception("SAXException when parsing topic file " + topicPath);
        } catch (IOException e) {
            e.printStackTrace();
            throw new Exception("IOException when parsing topic file " + topicPath);
        }

        doc.getDocumentElement().normalize();

        NodeList nList = doc.getElementsByTagName("topic");

        QualityQuery[] qqs = new QualityQuery[nList.getLength()];

        for (int i = 0; i < nList.getLength(); i++) {

            Node nNode = nList.item(i);

            if (nNode.getNodeType() == Node.ELEMENT_NODE) {

                Element eElement = (Element) nNode;

                String queryId = eElement.getElementsByTagName("identifier").item(0).getTextContent();
                Map<String, String> queryFields = new HashMap<String, String>();
                for (String topicField : topicFields) {
                    queryFields.put(topicField, eElement.getElementsByTagName(topicField).item(0).getTextContent());
                }

                qqs[i] = new QualityQuery(queryId, queryFields);

            }

        }

        return qqs;

    }

}