de.uniheidelberg.cl.swp.mlprocess.MLProcess.java Source code

Introduction

Here is the source code for de.uniheidelberg.cl.swp.mlprocess.MLProcess.java
Source

/*
 * ELAC: Ensemble Learning for Anaphora- and Coreference-Resolution-Systems
 * package: de.uniheidelberg.cl.swp.mlprocess
 * class: MLProcess
 * 
 * Authors: E-Mail
 * Thomas Boegel: boegel@cl.uni-heidelberg.de
 * Lukas Funk: funk@cl.uni-heidelberg.de
 * Andreas Kull: kull@cl.uni-heidelberg.de
 * 
 * Please find a detailed explanation of this particular class/package and its role and usage at
 * the first JavaDoc following this comment.
 * 
 * Copyright 2010 Thomas Boegel & Lukas Funk & Andreas Kull
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.uniheidelberg.cl.swp.mlprocess;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import weka.core.Instance;
import weka.core.Instances;
import de.uniheidelberg.cl.swp.datastructure.CoreferencePair;
import de.uniheidelberg.cl.swp.featureExtraction.FeatureExtractionProcess;
import de.uniheidelberg.cl.swp.io.ACRResultWriter;
import de.uniheidelberg.cl.swp.io.Configuration;
import de.uniheidelberg.cl.swp.testacr.Evaluation;
import de.uniheidelberg.cl.swp.testacr.PerformanceMeasurementMain;
import de.uniheidelberg.cl.swp.testacr.Runner;
import de.uniheidelberg.cl.swp.util.MMAXParser;

/**
 * Entry point for the testing process.
 * <br>
 * <p>Extracts the results of all ACR-Systems, builds a classifier based on the training data and
 * saves the positive classifications of the classifier to a list. Afterwards the classifier is
 * evaluated against the gold standard.</p>
 */
public class MLProcess {
    private WEKARunner wr;
    private Configuration config;
    private InstanceContainer ic;
    private ACRResultWriter writer;
    private Evaluation eval;
    private FeatureExtractionProcess fep;
    private String options = "";

    /**
     * The constructor initializes the necessary data structures and reads the config options.
     * <br>
     * Uses default ARFF file which is created during the training process.
     * 
     * @throws If something did go wrong with the testing process.
     */
    public MLProcess() throws Exception {
        config = Configuration.getInstance();
        wr = new WEKARunner(this.config.getProperties().getProperty("ResultOutputDir") + "results.arff");

        initWEKA();

        writer = new ACRResultWriter("MLProcess", config);
        eval = new Evaluation();
        fep = new FeatureExtractionProcess();
    }

    /**
     * Creates an MLProcess working on an external ARFF file.
     * <br>
     * Apart from the source of the ARFF data, the setup is the same as for the default
     * constructor: the configuration is read and the classifier is initialized.
     *
     * @param ARFF External ARFF file, handed to the {@link WEKARunner} unchanged.
     * @throws Exception If something went wrong while setting up the testing process.
     */
    public MLProcess(String ARFF) throws Exception {
        this.config = Configuration.getInstance();
        this.wr = new WEKARunner(ARFF);

        /* the classifier has to be set up before the remaining helpers are created */
        initWEKA();

        this.writer = new ACRResultWriter("MLProcess", this.config);
        this.eval = new Evaluation();
        this.fep = new FeatureExtractionProcess();
    }

    /**
     * Creates an MLProcess working on an already loaded set of WEKA instances.
     * <br>
     * Used for internal purposes, i.e. when the data is available in memory and does not have to
     * be read from an ARFF file.
     *
     * @param ins Instances object, handed to the {@link WEKARunner} unchanged.
     * @throws Exception If something went wrong while setting up the testing process.
     */
    public MLProcess(Instances ins) throws Exception {
        this.config = Configuration.getInstance();
        this.wr = new WEKARunner(ins);

        /* the classifier has to be set up before the remaining helpers are created */
        initWEKA();

        this.writer = new ACRResultWriter("MLProcess", this.config);
        this.eval = new Evaluation();
        this.fep = new FeatureExtractionProcess();
    }

    /**
     * Sets the options for classifiers from the configuration and initializes them.
     * <br>
     * The related config options are: <br>
     * classifier: e.g. ADABOOST <br>
     * subclassifier: e.g. J48 <br>
     * stacking: true <br>
     * options: e.g. -N 4 -R<br>
     * <br>
     * A missing or empty {@code options} value leaves the current options untouched, a missing or
     * empty {@code subclassifier} runs the plain classifier, and a missing {@code stacking} value
     * is treated as "stacking disabled".
     * <br>
     * Please consider the documentation for further information.
     *
     * @throws Exception If WEKA couldn't be initialized with the classifier.
     */
    private void initWEKA() throws Exception {
        final String classifier = config.getProperties().getProperty("classifier");
        final String subclassifier = config.getProperties().getProperty("subclassifier");
        final String configuredOptions = config.getProperties().getProperty("options");

        /* do we have options for our classifiers? (content check, not a reference comparison) */
        if (configuredOptions != null && !configuredOptions.isEmpty()) {
            options = configuredOptions;
        }

        /* no subclassifier configured: run the plain classifier */
        if (subclassifier == null || subclassifier.isEmpty()) {
            wr.run(classifier, options);
            return;
        }

        /* constant-first equals() so that an unset "stacking" option does not throw */
        if ("true".equals(config.getProperties().getProperty("stacking"))) {
            wr.run(classifier, config.getStackingClassifiers(), options);
        } else {
            wr.run(classifier, subclassifier, options);
        }
    }

    /**
     * Runs every configured ACR-System on one test document and collects the
     * {@link CoreferencePair}s it predicts.
     * <br>
     * As a side effect the instance container is re-created with the current attribute structure
     * and the features of all predicted pairs are extracted.
     *
     * @param mmaxParser A parser object for the MMAX file of the test corpus to process.
     * @return Mapping from the canonical class name of an ACR-System to the pairs it predicted.
     * @throws Exception If an ACR-System or the feature extraction fails.
     */
    private Map<String, List<CoreferencePair>> extractTestCorefs(MMAXParser mmaxParser) throws Exception {
        /* temporary container to convert our coreferences to a WEKA-compatible format */
        ic = new InstanceContainer();
        ic.createAttributeStructure(fep.getFeatureExtractorList());

        final Map<String, List<CoreferencePair>> pairsBySystem = new HashMap<String, List<CoreferencePair>>();

        for (Runner acrSystem : config.getAcRRunner()) {
            acrSystem.init(mmaxParser);
            acrSystem.run();

            final List<CoreferencePair> systemPairs =
                    MMAXParser.splitCorefChainsToPairs(acrSystem.getCoreferents());
            fep.extractFeatures(systemPairs, mmaxParser);

            /* runners of the same class share one entry */
            final String systemName = acrSystem.getClass().getCanonicalName();
            if (!pairsBySystem.containsKey(systemName)) {
                pairsBySystem.put(systemName, systemPairs);
            } else {
                pairsBySystem.get(systemName).addAll(systemPairs);
            }
        }
        return pairsBySystem;
    }

    /**
     * Classifies the test-{@link CoreferencePair}s with the classifier held by the WEKA runner
     * and keeps those which receive the label {@code 0.0}.
     * <br>
     * Every pair is added to the instance container first. Accepted pairs get their ACR-System
     * set to the string value of the second-to-last attribute of their instance. The accepted
     * pairs are passed through {@code removeDuplicates} before they are returned.
     *
     * @param testCorefs {@link CoreferencePair}s extracted from the test corpus, grouped by the
     *         name of the ACR-System which produced them.
     * @return {@link CoreferencePair}s which are predicted by our classifier to be correct.
     * @throws Exception If an instance couldn't be created or labeled.
     */
    private List<CoreferencePair> createPrediction(Map<String, List<CoreferencePair>> testCorefs) throws Exception {
        final List<CoreferencePair> accepted = new ArrayList<CoreferencePair>();

        for (Map.Entry<String, List<CoreferencePair>> perSystem : testCorefs.entrySet()) {
            final String systemName = perSystem.getKey();

            for (final CoreferencePair candidate : perSystem.getValue()) {
                final Instance instance = ic.addCorefInstance(candidate, systemName);
                instance.setDataset(ic.getInstances());

                /* use the classifier to select a label; everything but 0.0 is discarded */
                if (wr.labelUnknownInstance(instance) != 0.0) {
                    continue;
                }

                candidate.setAcrSystem(instance.stringValue(instance.numAttributes() - 2));
                accepted.add(candidate);
            }
        }
        return removeDuplicates(accepted);
    }

    /**
     * Compares the predicted {@link CoreferencePair}s of our classifier with the gold standard
     * and writes the resulting true positives, false negatives and false positives.
     * <br>
     * A failure to write the output is reported on standard error only; it does not abort the
     * process.
     *
     * @param mlPredictions Predictions from our classifier.
     * @param goldCorefPairs Gold standard.
     * @param fileName Name handed to the result writer for the results of this evaluation.
     */
    private void doEvaluation(List<CoreferencePair> mlPredictions, List<CoreferencePair> goldCorefPairs,
            String fileName) {
        this.eval.evaluateCorefPairs(mlPredictions, goldCorefPairs);

        try {
            this.writer.writeCorefResults(
                    fileName,
                    this.eval.getTruePositiveList(),
                    this.eval.getFalseNegativeList(),
                    this.eval.getFalsePositiveList());
        } catch (IOException writeFailure) {
            /* best effort: the evaluation itself has already been recorded in eval */
            System.err.println("Couldn't write output file.");
            writeFailure.printStackTrace();
        }
    }

    /**
     * Removes duplicates in our list.
     * <br>
     * Of several pairs which are equal according to {@code equals()}, only the first one is kept;
     * the relative order of the remaining pairs is unchanged. The given list is not modified.
     *
     * @param corefList list of our predicted {@link CoreferencePair}s with duplicates.
     * @return new list of our predicted {@link CoreferencePair}s without duplicates.
     */
    private List<CoreferencePair> removeDuplicates(List<CoreferencePair> corefList) {
        List<CoreferencePair> targetList = new ArrayList<CoreferencePair>();

        for (CoreferencePair coreferencePair : corefList) {
            /*
             * contains() only relies on equals(), so this works no matter whether
             * CoreferencePair also overrides hashCode()
             */
            if (!targetList.contains(coreferencePair)) {
                targetList.add(coreferencePair);
            }
        }
        return targetList;
    }

    /**
     * Entry point for external callers.
     *
     * <p>For every MMAX file of the test corpus the gold standard is read and its features are
     * extracted, the ACR-Systems are run on the file, their predictions are classified and the
     * positive classifications are evaluated against the gold standard. After all files have been
     * processed, the overall result is written and a short notice is printed to standard
     * output.</p>
     *
     * @throws Exception If an error occurred during the MLProcess.
     */
    public void run() throws Exception {
        for (String testFile : PerformanceMeasurementMain.getMMAxFiles("Test")) {
            final MMAXParser parser = MMAXParser.processMMAXFile(testFile);

            /* gold standard */
            final List<CoreferencePair> goldPairs = parser.getCorefPairs();

            /* feature extraction for the attribute structure */
            fep.extractFeatures(goldPairs, parser);

            ic = new InstanceContainer();
            ic.createAttributeStructure(fep.getFeatureExtractorList());

            /* test coreferences */
            final Map<String, List<CoreferencePair>> systemOutput = extractTestCorefs(parser);

            /* positive predictions; duplicates are already handled by createPrediction */
            final List<CoreferencePair> positives = createPrediction(systemOutput);

            doEvaluation(positives, goldPairs, testFile);
        }
        writer.writeOverallResult(eval);

        final String outputDir = Configuration.getInstance().getProperties().getProperty("ResultOutputDir");

        System.out.println(
                "\n\n######\nTesting process finished.\nResults written to " + outputDir + "\n######\n");
    }

    /**
     * Entry point for external callers with the possibility to specify FeatureSets and results.
     * This method is useful to save runtime, if the MLProcess has to be executed several times.
     *
     * <p>The ACR-Systems are not run here; their predictions are taken from {@code testList}.
     * The feature extraction uses the {@code fep} passed as argument, not the one stored in this
     * object. Only the results of the given file are written; the overall result is not.</p>
     *
     * @param fep The {@link FeatureExtractionProcess} containing the desired list of features.
     * @param gold The {@link MMAXParser} containing the gold standard.
     * @param testList Predictions of ACR systems for the current file.
     * @throws Exception If an error occurred during the MLProcess.
     */
    public void run(FeatureExtractionProcess fep, MMAXParser gold, Map<String, List<CoreferencePair>> testList)
            throws Exception {
        /* gold standard */
        final List<CoreferencePair> goldPairs = gold.getCorefPairs();

        /* feature extraction for the attribute structure */
        fep.extractFeatures(goldPairs, gold);

        this.ic = new InstanceContainer();
        this.ic.createAttributeStructure(fep.getFeatureExtractorList());

        /* features of the predictions of every ACR system */
        for (List<CoreferencePair> systemPairs : testList.values()) {
            fep.extractFeatures(systemPairs, gold);
        }

        /* positive predictions; duplicates are already handled by createPrediction */
        final List<CoreferencePair> positives = createPrediction(testList);

        final String resultName = gold.getDiscourse().getCommonBasedataPath();
        doEvaluation(positives, goldPairs, resultName);
    }

    /**
     * Setter to manually set the features which should be extracted.
     * <br>
     * Replaces the {@link FeatureExtractionProcess} stored in this object, which is the one used
     * by the parameterless {@code run()}. The {@code run} variant which receives a
     * {@link FeatureExtractionProcess} as argument uses that argument instead.
     *
     * @param fep The {@link FeatureExtractionProcess} containing the desired features.
     */
    public void setFep(FeatureExtractionProcess fep) {
        this.fep = fep;
    }

    /**
     * Getter for the {@link Evaluation} results (to compare multiple MLProcesses).
     * <br>
     * The object itself is returned, not a copy.
     *
     * @return The {@link Evaluation} object used by this MLProcess.
     */
    public Evaluation getEval() {
        return this.eval;
    }
}