org.openml.webapplication.features.ExtractFeatures.java Source code

Java tutorial

Introduction

Here is the source code for org.openml.webapplication.features.ExtractFeatures.java

Source

/*
 *  Webapplication - Java library that runs on OpenML servers
 *  Copyright (C) 2014 
 *  @author Jan N. van Rijn (j.n.van.rijn@liacs.leidenuniv.nl)
 *  
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *  
 */
package org.openml.webapplication.features;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.openml.apiconnector.algorithms.Conversion;
import org.openml.apiconnector.xml.DataFeature.Feature;
import org.openml.apiconnector.xml.DataQuality.Quality;
import org.openml.webapplication.fantail.dc.Characterizer;
import org.openml.webapplication.fantail.dc.statistical.SimpleMetaFeatures;
import org.openml.webapplication.models.AttributeStatistics;

import weka.core.Attribute;
import weka.core.AttributeStats;
import weka.core.Instances;

public class ExtractFeatures {

    public static final int BASIC_FEATURES = SimpleMetaFeatures.ids.length;

    public static List<Feature> getFeatures(Instances dataset, String defaultClass) {
        if (defaultClass != null) {
            dataset.setClass(dataset.attribute(defaultClass));
        } else {
            dataset.setClassIndex(dataset.numAttributes() - 1);
        }

        final ArrayList<Feature> resultFeatures = new ArrayList<Feature>();

        for (int i = 0; i < dataset.numAttributes(); i++) {
            Attribute att = dataset.attribute(i);
            int numValues = dataset.classAttribute().isNominal() ? dataset.classAttribute().numValues() : 0;
            AttributeStatistics attributeStats = new AttributeStatistics(dataset.attribute(i), numValues);

            for (int j = 0; j < dataset.numInstances(); ++j) {
                attributeStats.addValue(dataset.get(j).value(i), dataset.get(j).classValue());
            }

            String data_type = null;

            Integer numberOfDistinctValues = null;
            Integer numberOfUniqueValues = null;
            Integer numberOfMissingValues = null;
            Integer numberOfIntegerValues = null;
            Integer numberOfRealValues = null;
            Integer numberOfNominalValues = null;
            Integer numberOfValues = null;

            Double maximumValue = null;
            Double minimumValue = null;
            Double meanValue = null;
            Double standardDeviation = null;

            AttributeStats as = dataset.attributeStats(i);

            numberOfDistinctValues = as.distinctCount;
            numberOfUniqueValues = as.uniqueCount;
            numberOfMissingValues = as.missingCount;
            numberOfIntegerValues = as.intCount;
            numberOfRealValues = as.realCount;
            numberOfMissingValues = as.missingCount;

            if (att.isNominal()) {
                numberOfNominalValues = att.numValues();
            }
            numberOfValues = attributeStats.getTotalObservations();

            if (att.isNumeric()) {
                maximumValue = attributeStats.getMaximum();
                minimumValue = attributeStats.getMinimum();
                meanValue = attributeStats.getMean();
                standardDeviation = 0.0;
                try {
                    standardDeviation = attributeStats.getStandardDeviation();
                } catch (Exception e) {
                    Conversion.log("WARNING", "StdDev", "Could not compute standard deviation of feature "
                            + att.name() + ": " + e.getMessage());
                }
            }

            if (att.type() == 0) {
                data_type = "numeric";
            } else if (att.type() == 1) {
                data_type = "nominal";
            } else if (att.type() == 2) {
                data_type = "string";
            } else {
                data_type = "unknown";
            }

            resultFeatures.add(new Feature(att.index(), att.name(), data_type, att.index() == dataset.classIndex(),
                    numberOfDistinctValues, numberOfUniqueValues, numberOfMissingValues, numberOfIntegerValues,
                    numberOfRealValues, numberOfNominalValues, numberOfValues, maximumValue, minimumValue,
                    meanValue, standardDeviation, attributeStats.getClassDistribution()));
        }
        return resultFeatures;
    }

    public static List<Quality> getQualities(Instances dataset, String defaultClass) {
        if (defaultClass != null) {
            dataset.setClass(dataset.attribute(defaultClass));
        } else {
            dataset.setClassIndex(dataset.numAttributes() - 1);
        }
        List<Quality> result = new ArrayList<Quality>();
        Characterizer simpleQualities = new SimpleMetaFeatures();
        Map<String, Double> qualities = simpleQualities.characterize(dataset);
        for (String quality : qualities.keySet()) {
            result.add(new Quality(quality, qualities.get(quality) + ""));
        }
        return result;
    }
}