Example usage for weka.core Instances enumerateInstances

List of usage examples for weka.core Instances enumerateInstances

Introduction

In this page you can find the example usage for weka.core Instances enumerateInstances.

Prototype

public Enumeration<Instance> enumerateInstances()

Source Link

Document

Returns an enumeration of all instances in the dataset.

Usage

From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java

License:Open Source License

/**
 * Replaces the training data with one exemplar per cluster: builds a
 * SimpleKMeans clustering of the training set, then for each (virtual)
 * centroid picks the closest real training instance and writes those
 * exemplars out as the adapted training set used by the test task.
 *
 * @param aContext task context providing input/output storage locations
 * @throws Exception if reading the ARFF file, clustering, or writing fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    // Multi-label setups have no single cluster assignment per instance.
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // First argument is the clusterer class name; the rest are its options.
    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    // Untouched copy of the training data; trainData itself is cleared and
    // refilled with the selected exemplars below.
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    // Reuse trainData as the container for the selected exemplars.
    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        // NOTE(review): this assumes instance order in clusterTrainData matches
        // copyTrainData (i.e. the Remove filter preserves order) — verify if changed.
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}

From source file:de.unidue.langtech.grading.tc.ClusteringTask.java

License:Open Source License

/**
 * Returns a mapping from cluster IDs to instance offsets
 * @return/*from w  ww . j a va2  s.  co  m*/
 */
private Map<Integer, Set<Integer>> getClusterMap(Instances data, Clusterer clusterer) throws Exception {
    Map<Integer, Set<Integer>> clusterMap = new HashMap<Integer, Set<Integer>>();

    @SuppressWarnings("rawtypes")
    Enumeration instanceEnumeration = data.enumerateInstances();
    int instanceOffset = 0;
    while (instanceEnumeration.hasMoreElements()) {
        Instance instance = (Instance) instanceEnumeration.nextElement();
        double[] distribution = clusterer.distributionForInstance(instance);
        int clusterId = 0;
        for (double value : distribution) {
            if (new Double(value).intValue() == 1) {
                Set<Integer> clusterInstances;
                if (!clusterMap.containsKey(clusterId)) {
                    clusterInstances = new HashSet<Integer>();
                    clusterMap.put(clusterId, clusterInstances);
                }
                clusterInstances = clusterMap.get(clusterId);
                clusterInstances.add(instanceOffset);
                clusterMap.put(clusterId, clusterInstances);
            }
            clusterId++;
        }
        instanceOffset++;
    }

    return clusterMap;
}

From source file:decisiontree.MyC45.java

/**
 * Method for building a C4.5 tree node, recursively.
 *
 * Chooses the attribute with the maximum gain ratio. Makes this node a
 * leaf when no instances remain or the best gain ratio is zero; otherwise
 * splits the data on the chosen attribute and recurses into the children.
 *
 * @param instances the training data reaching this node
 * @exception Exception if decision tree can't be built successfully
 */
private void makeTree(Instances instances) throws Exception {

    // Check if no instances have reached this node.
    if (instances.numInstances() == 0) {
        m_Attribute = null;
        m_ClassValue = Instance.missingValue();
        m_Distribution = new double[instances.numClasses()];
        return;
    }

    // Compute attribute with maximum gain ratio.
    // Numeric attributes are evaluated at their best binary threshold.
    double[] gainRatios = new double[instances.numAttributes()];
    Enumeration attrEnum = instances.enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (attr.isNominal()) {
            gainRatios[attr.index()] = computeGainRatio(instances, attr);
        } else if (attr.isNumeric()) {
            gainRatios[attr.index()] = computeGainRatio(instances, attr, computeThreshold(instances, attr));
        }
    }
    m_Attribute = instances.attribute(Utils.maxIndex(gainRatios));

    // Make leaf if gain ratio is zero. 
    // Otherwise create successors.
    if (Utils.eq(gainRatios[m_Attribute.index()], 0)) {
        // Leaf: store the normalized class distribution and predict the
        // majority class.
        m_Attribute = null;
        m_Distribution = new double[instances.numClasses()];
        Enumeration instEnum = instances.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            m_Distribution[(int) inst.classValue()]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = instances.classAttribute();
    } else {
        // Inner node: nominal attributes get one child per value,
        // numeric attributes get a binary threshold split.
        Instances[] splitData = null;
        int child = 0;
        if (m_Attribute.isNominal()) {
            child = m_Attribute.numValues();
            splitData = splitData(instances, m_Attribute);
        } else if (m_Attribute.isNumeric()) {
            child = 2;
            splitData = splitData(instances, m_Attribute, computeThreshold(instances, m_Attribute));
        }
        m_Successors = new MyC45[child];
        for (int j = 0; j < child; j++) {
            m_Successors[j] = new MyC45();
            m_Successors[j].makeTree(splitData[j]);
        }
    }
}

From source file:decisiontree.MyC45.java

/**
 * Partitions a dataset by the distinct values of a nominal attribute.
 *
 * @param data the data which is to be split
 * @param att the attribute to be used for splitting
 * @return one Instances set per attribute value, compacted
 */
private Instances[] splitData(Instances data, Attribute att) {
    // One empty partition per value of the nominal attribute.
    int valueCount = att.numValues();
    Instances[] partitions = new Instances[valueCount];
    for (int v = 0; v < valueCount; v++) {
        partitions[v] = new Instances(data, data.numInstances());
    }

    // Route each instance to the partition matching its attribute value.
    int total = data.numInstances();
    for (int i = 0; i < total; i++) {
        Instance inst = data.instance(i);
        partitions[(int) inst.value(att)].add(inst);
    }

    // Trim the partitions' internal storage to their actual sizes.
    for (Instances partition : partitions) {
        partition.compactify();
    }
    return partitions;
}

From source file:decisiontree.MyC45.java

/**
 * Splits a dataset into two subsets around a numeric threshold.
 *
 * Instances with value >= threshold go into index 1 (clamped to the
 * threshold), the rest into index 0 (value reset to 0).
 *
 * NOTE(review): setValue mutates the instance before it is added —
 * presumably this also alters the instance held by the source dataset;
 * confirm Weka's Instances.add copy semantics and whether downstream
 * recursion depends on this clamping.
 *
 * @param data the data which is to be split
 * @param att the attribute to be used for splitting
 * @param threshold the numeric cut point
 * @return the sets of instances produced by the split
 */
private Instances[] splitData(Instances data, Attribute att, double threshold) {
    Instances[] splitData = new Instances[2];
    for (int i = 0; i < 2; i++) {
        splitData[i] = new Instances(data, data.numInstances());
    }

    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        if (inst.value(att) >= threshold) {
            inst.setValue(att, threshold);
            splitData[1].add(inst);
        } else {
            inst.setValue(att, 0);
            splitData[0].add(inst);
        }
    }
    // Trim both subsets' internal storage to their actual sizes.
    for (int i = 0; i < splitData.length; i++) {
        splitData[i].compactify();
    }
    return splitData;
}

From source file:decisiontree.MyID3.java

/**
 * Recursively builds an ID3 decision tree node for the given data.
 *
 * Chooses the attribute with the maximum information gain. Makes this node
 * a leaf when no instances remain, all instances share one class, or the
 * best gain is zero; otherwise splits on the chosen attribute and recurses.
 *
 * @param data the training data reaching this node
 */
private void makeTree(Instances data) {
    // Check if no instances have reached this node.  
    if (data.numInstances() == 0) {
        splitAttr = null;
        leafValue = Double.NaN;
        leafDist = new double[data.numClasses()];
        return;
    }

    // Pure node: all instances have the same class value.
    if (data.numDistinctValues(data.classIndex()) == 1) {
        leafValue = data.firstInstance().classValue();
        return;
    }

    // Compute attribute with maximum information gain.  
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    splitAttr = data.attribute(maxIndex(infoGains));

    // Make leaf if information gain is zero.   
    // Otherwise create successors.  
    if (Utils.eq(infoGains[splitAttr.index()], 0)) {
        // Leaf: store the normalized class distribution and predict the
        // majority class.
        splitAttr = null;
        leafDist = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            leafDist[(int) inst.classValue()]++;
        }
        normalize(leafDist);
        leafValue = Utils.maxIndex(leafDist);
        classAttr = data.classAttribute();
    } else {
        // Inner node: one child per value of the split attribute.
        Instances[] splitData = splitData(data, splitAttr);
        child = new MyID3[splitAttr.numValues()];
        for (int j = 0; j < splitAttr.numValues(); j++) {
            child[j] = new MyID3();
            child[j].makeTree(splitData[j]);
        }
    }
}

From source file:decisiontree.MyID3.java

/**
 * Computes the Shannon entropy (base 2) of the class distribution of the
 * given dataset.
 *
 * Class values are counted in order of first appearance; slots beyond the
 * number of distinct observed values stay zero.
 *
 * @param data the instances whose class entropy is computed
 * @return the entropy of the class distribution, in bits
 */
private double computeEntropy(Instances data) {
    int numClasses = data.numClasses();
    int[] classCount = new int[numClasses];
    ArrayList<Double> classValues = new ArrayList<>();
    Enumeration<Instance> instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        double classValue = instEnum.nextElement().classValue();
        // Single scan instead of the old contains() + indexOf() double scan.
        int index = classValues.indexOf(classValue);
        if (index < 0) {
            classValues.add(classValue);
            index = classValues.size() - 1;
        }
        classCount[index]++;
    }
    double entropy = 0.0;
    // Iterate counts directly; the old loop re-ran indexOf per class value.
    for (int i = 0; i < classValues.size(); i++) {
        if (classCount[i] > 0) {
            double p = (double) classCount[i] / data.numInstances();
            entropy -= p * Utils.log2(p);
        }
    }
    return entropy;
}

From source file:decisiontree.MyID3.java

/**
 * Splits the data into one subset per value of a nominal attribute.
 *
 * @param data the instances to split
 * @param att the nominal attribute driving the split
 * @return the per-value subsets, compacted
 */
private Instances[] splitData(Instances data, Attribute att) {
    // One empty subset per attribute value.
    int valueCount = att.numValues();
    Instances[] subsets = new Instances[valueCount];
    for (int v = 0; v < valueCount; v++) {
        subsets[v] = new Instances(data, data.numInstances());
    }

    // Assign each instance to the subset for its value of att.
    int total = data.numInstances();
    for (int i = 0; i < total; i++) {
        Instance inst = data.instance(i);
        subsets[(int) inst.value(att)].add(inst);
    }

    // Trim each subset's internal storage to its actual size.
    for (int v = 0; v < subsets.length; v++) {
        subsets[v].compactify();
    }
    return subsets;
}

From source file:dewaweebtreeclassifier.Sujeong.java

/**
 * Computes the Shannon entropy (base 2) of the class distribution of the
 * given dataset.
 *
 * @param data the instances whose class entropy is computed
 * @return the entropy of the class distribution, in bits
 */
public double computeEntropy(Instances data) {
    // Tally how many instances fall into each class.
    double[] classCounts = new double[data.numClasses()];
    int total = data.numInstances();
    for (int i = 0; i < total; i++) {
        classCounts[(int) data.instance(i).classValue()]++;
    }

    // Sum -p*log2(p) over the non-empty classes.
    double entropy = 0.0;
    for (int c = 0; c < data.numClasses(); c++) {
        if (classCounts[c] > 0) {
            double p = classCounts[c] / data.numInstances();
            entropy -= p * Utils.log2(p);
        }
    }

    return entropy;
}

From source file:dewaweebtreeclassifier.Sujeong.java

/**
 * Splits the data into one bucket per value of a nominal attribute.
 *
 * @param data the instances to split
 * @param attr the nominal attribute driving the split
 * @return the per-value buckets, compacted
 */
public Instances[] splitInstancesOnAttribute(Instances data, Attribute attr) {
    // Prepare one (initially empty) bucket per nominal value of attr.
    int valueCount = attr.numValues();
    Instances[] buckets = new Instances[valueCount];
    for (int v = 0; v < valueCount; v++) {
        buckets[v] = new Instances(data, data.numInstances());
    }

    // Drop every instance into the bucket for its value of attr.
    int total = data.numInstances();
    for (int i = 0; i < total; i++) {
        Instance inst = data.instance(i);
        buckets[(int) inst.value(attr)].add(inst);
    }

    // Trim each bucket's internal storage to its actual size.
    for (int v = 0; v < valueCount; v++) {
        buckets[v].compactify();
    }

    return buckets;
}