de.tudarmstadt.ukp.dkpro.keyphrases.core.evaluator.KeyphraseDatasetStatistics.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.keyphrases.core.evaluator.KeyphraseDatasetStatistics.java, a UIMA analysis component from DKPro Keyphrases that computes descriptive statistics over a keyphrase gold-standard dataset.

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.keyphrases.core.evaluator;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.keyphrases.core.evaluator.util.EvaluatorUtils;
//import org.apache.commons.math.stat.correlation.PearsonsCorrelation;

/**
 * Outputs statistics about a keyphrase dataset: number of documents and gold keyphrases,
 * tokens per document, keyphrases per document, characters and tokens per keyphrase,
 * upper bounds for P@10 and R@10, and a histogram of keyphrase lengths in tokens.
 *
 * @author zesch
 */
public class KeyphraseDatasetStatistics extends JCasAnnotator_ImplBase {

    // In some cases the capitalization of the gold standard keyphrases
    // does not reflect the capitalization in the text.
    // As there are only rare cases where a keyphrase should be
    // matched in uppercase but not in lowercase (e.g. apple vs. Apple),
    // lowercasing seems acceptable. The behavior can be overridden via this parameter.
    public static final String PARAM_LOWERCASE = "lowercase";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false")
    private boolean toLowercase;

    // The suffix of the gold keyphrase files. Default is ".key".
    // Suffixes should always be given including the leading ".".
    public static final String PARAM_GOLD_SUFFIX = "GoldSuffix";
    @ConfigurationParameter(name = PARAM_GOLD_SUFFIX, mandatory = false, defaultValue = ".key")
    private String goldSuffix;

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");
    private static final String MORE_LESS_LITERAL = "(+/- ";
    private static final int MIN_STD_DEV_SIZE = 1;

    private static int nrofDocuments;
    private static int nrofKeyphrases;
    private static int sumNrofTokens;
    private static int sumLgthKeyphrs;
    private static List<Integer> tknsPerKeyphr;
    private static List<Integer> charsPerKeyphr;
    private static List<Integer> tokensPerDocument;
    private static List<Integer> keyphrsPerDoc;

    private static List<Double> upperBoundP10s;
    private static List<Double> upperBoundR10s;

    private static List<Integer> tokenSizeList;
    private static List<Integer> goldSizeList;

    @Override
    public void initialize(final UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        nrofDocuments = 0;
        nrofKeyphrases = 0;
        sumNrofTokens = 0;
        sumLgthKeyphrs = 0;
        tknsPerKeyphr = new ArrayList<Integer>();
        charsPerKeyphr = new ArrayList<Integer>();
        tokensPerDocument = new ArrayList<Integer>();
        keyphrsPerDoc = new ArrayList<Integer>();

        upperBoundP10s = new ArrayList<Double>();
        upperBoundR10s = new ArrayList<Double>();

        tokenSizeList = new ArrayList<Integer>();
        goldSizeList = new ArrayList<Integer>();
    }

    @Override
    public void process(final JCas jcas) throws AnalysisEngineProcessException {

        nrofDocuments++;

        final DocumentMetaData docMetaData = DocumentMetaData.get(jcas);
        final String currentTitle = docMetaData.getDocumentTitle();
        getContext().getLogger().log(Level.INFO, "Document title: " + currentTitle);

        // get the gold keyphrases from the file system
        final Set<String> goldKeyphrases = EvaluatorUtils.getGoldKeyphrases(docMetaData, goldSuffix, toLowercase);

        final AnnotationIndex<Annotation> tokenIndex = jcas.getAnnotationIndex(Token.type);
        sumNrofTokens += tokenIndex.size();
        tokensPerDocument.add(tokenIndex.size());

        for (final String goldKeyphrase : goldKeyphrases) {
            sumLgthKeyphrs += goldKeyphrase.length();
            tknsPerKeyphr.add(goldKeyphrase.split("\\s+").length);
            charsPerKeyphr.add(goldKeyphrase.length());

            nrofKeyphrases++;
        }

        keyphrsPerDoc.add(goldKeyphrases.size());

        tokenSizeList.add(tokenIndex.size());
        goldSizeList.add(goldKeyphrases.size());

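        // Upper bounds for P@10 and R@10 given n gold keyphrases:
        // with n < 10, at most n of the 10 returned candidates can be correct
        // (P@10 <= n/10) while all gold keyphrases can still be found (R@10 = 1);
        // with n >= 10, P@10 can reach 1, but at most 10 of the n gold keyphrases
        // can appear among the top 10 (R@10 <= 10/n).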
        int n = goldKeyphrases.size();
        if (n < 10) {
            upperBoundP10s.add(Double.valueOf(1 - (double) (10 - n) / 10));
            upperBoundR10s.add(1d);
        } else {
            upperBoundP10s.add(1d);
            upperBoundR10s.add((double) 10 / n);

        }

    }

    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        final double averageNrofTokens = (double) sumNrofTokens / nrofDocuments;
        final double stdDevAvgNrTokens = stdDev(tokensPerDocument);
        final double avgLngthKeyphrs = (double) sumLgthKeyphrs / nrofKeyphrases;
        final double stdDevAvgLnthKphr = stdDev(charsPerKeyphr);
        final double avgToksKeyphrs = mean(tknsPerKeyphr);
        final double stdDevToksKeyphr = stdDev(tknsPerKeyphr);
        final double avgKeyphrPerDoc = (double) nrofKeyphrases / nrofDocuments;
        final double stdDevKeyphrDoc = stdDev(keyphrsPerDoc);

        final double upperBoundP10 = meanDouble(upperBoundP10s);
        final double upperBoundR10 = meanDouble(upperBoundR10s);

        //        double[] tokenSizeArray = listToArray(tokenSizeList);
        //        double[] goldSizeArray  = listToArray(goldSizeList);

        final StringBuilder stringBuilder = new StringBuilder(272);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("# Documents:               ");
        stringBuilder.append(nrofDocuments);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Tokens / Document:         ");
        stringBuilder.append(averageNrofTokens);
        stringBuilder.append(MORE_LESS_LITERAL);
        stringBuilder.append(stdDevAvgNrTokens);
        stringBuilder.append(", Median: ");
        stringBuilder.append(median(tokensPerDocument));
        stringBuilder.append(')');
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("# Keyphrases:              ");
        stringBuilder.append(nrofKeyphrases);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Keyphrases / Document:     ");
        stringBuilder.append(avgKeyphrPerDoc);
        stringBuilder.append(MORE_LESS_LITERAL);
        stringBuilder.append(stdDevKeyphrDoc);
        stringBuilder.append(')');
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Characters / Keyphrase:    ");
        stringBuilder.append(avgLngthKeyphrs);
        stringBuilder.append(MORE_LESS_LITERAL);
        stringBuilder.append(stdDevAvgLnthKphr);
        stringBuilder.append(')');
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Tokens / Keyphrase:        ");
        stringBuilder.append(avgToksKeyphrs);
        stringBuilder.append(MORE_LESS_LITERAL);
        stringBuilder.append(stdDevToksKeyphr);
        stringBuilder.append(')');
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Upper bound (P@10):        ");
        stringBuilder.append(upperBoundP10);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Upper bound (R@10):        ");
        stringBuilder.append(upperBoundR10);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append("Pearson Correlation between document size and the number of gold keyphrases:");
        stringBuilder.append(LINE_SEPARATOR);
        //        sb.append(new PearsonsCorrelation().correlation(
        //        tokenSizeArray, goldSizeArray));
        //        sb.append(LF);
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append(getNrOfTokensPerKeyphraseHistogramm());
        stringBuilder.append(LINE_SEPARATOR);
        stringBuilder.append(LINE_SEPARATOR);
        getContext().getLogger().setOutputStream(System.out);
        getContext().getLogger().log(Level.INFO, stringBuilder.toString());
    }

    //    private double[] listToArray(List<Integer> list) {
    //        double[] array = new double[list.size()];
    //        int i = 0;
    //        for (int value : list) {
    //            array[i] = value;
    //            i++;
    //        }
    //        return array;
    //    }

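    /**
     * Builds a histogram of keyphrase lengths in tokens; each output line has
     * the form "length:count".
     */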
    private String getNrOfTokensPerKeyphraseHistogramm() {
        final Map<Integer, Integer> tksPerKeyphrMap = new TreeMap<Integer, Integer>();
        for (final Integer nrOfTokens : tknsPerKeyphr) {
            if (tksPerKeyphrMap.containsKey(nrOfTokens)) {
                tksPerKeyphrMap.put(nrOfTokens, tksPerKeyphrMap.get(nrOfTokens) + 1);
            } else {
                tksPerKeyphrMap.put(nrOfTokens, 1);
            }
        }

        final StringBuilder stringBuilder = new StringBuilder();
        for (final Map.Entry<Integer, Integer> entry : tksPerKeyphrMap.entrySet()) {
            stringBuilder.append(entry.getKey());
            stringBuilder.append(':');
            stringBuilder.append(entry.getValue());
            stringBuilder.append(LINE_SEPARATOR);
        }
        return stringBuilder.toString();
    }

    public double mean(final List<Integer> values) {
        double mean = 0.0;
        if (!values.isEmpty()) {
            double sum = 0.0;
            for (final Integer value : values) {
                sum += value;
            }

            mean = sum / values.size();
        }
        return mean;
    }

    public double meanDouble(final List<Double> values) {
        double mean = 0.0;
        if (!values.isEmpty()) {
            double sum = 0.0;
            for (final Double value : values) {
                sum += value;
            }

            mean = sum / values.size();
        }
        return mean;
    }

    public double median(final List<Integer> values) {
        final DescriptiveStatistics stats = new DescriptiveStatistics();
        for (final Integer value : values) {
            stats.addValue(value);
        }
        // getPercentile expects a percentile in the range (0, 100],
        // so the median is the 50th percentile, not 0.5.
        return stats.getPercentile(50);
    }

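    /**
     * Population standard deviation of the given values. Returns 0.0 if fewer
     * than two values are available.
     */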
    public double stdDev(final List<Integer> values) {
        double stdDev = 0.0;

        if (values.size() > MIN_STD_DEV_SIZE) {
            final double meanValue = mean(values);
            double sum = 0.0;

            for (final Integer value : values) {
                final double diff = value - meanValue;
                sum += diff * diff;
            }
            stdDev = Math.sqrt(sum / (values.size()));
        }
        return stdDev;
    }

}
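
Usage example

The following sketch is not part of the original file; it shows one way to run this annotator with uimaFIT. The reader, segmenter, corpus path, and file patterns are assumptions: it uses dkpro-core's TextReader and BreakIteratorSegmenter and expects a ".key" gold file next to each ".txt" document. Adapt these to your own dataset and component versions.

import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;

import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import de.tudarmstadt.ukp.dkpro.keyphrases.core.evaluator.KeyphraseDatasetStatistics;

public class KeyphraseDatasetStatisticsExample {

    public static void main(String[] args) throws Exception {
        SimplePipeline.runPipeline(
                // Read plain-text documents; gold keyphrases are expected in
                // sibling files with the suffix configured below (e.g. doc1.txt / doc1.key).
                CollectionReaderFactory.createReaderDescription(
                        TextReader.class,
                        TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/dataset",
                        TextReader.PARAM_PATTERNS, "*.txt",
                        TextReader.PARAM_LANGUAGE, "en"),
                // Token annotations are required, as the statistics count tokens per document.
                AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
                AnalysisEngineFactory.createEngineDescription(
                        KeyphraseDatasetStatistics.class,
                        KeyphraseDatasetStatistics.PARAM_GOLD_SUFFIX, ".key",
                        KeyphraseDatasetStatistics.PARAM_LOWERCASE, true));
    }
}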