Example usage for weka.core Instances enumerateInstances

List of usage examples for weka.core Instances enumerateInstances

Introduction

In this page you can find the example usage for weka.core Instances enumerateInstances.

Prototype

public Enumeration<Instance> enumerateInstances()

Source Link

Document

Returns an enumeration of all instances in the dataset.

Usage

From source file:myclassifier.myC45Pack.SplitModel.java

/**
 * Evaluates a split on the nominal attribute at {@code attribIndex}.
 *
 * <p>Each instance whose value for that attribute is known is added to a fresh
 * {@code ClassDistribution}, using the attribute value (truncated to {@code int})
 * as the branch index. If the distribution reports that it is splittable for
 * {@code minInstances}, the number of subsets, the information gain and the
 * gain ratio fields are updated; otherwise they are left untouched.
 *
 * @param dataSet instances to evaluate the split on
 * @throws Exception declared by the signature; propagated from the calls made here
 */
private void handleNominalAttribute(Instances dataSet) throws Exception {
    classDist = new ClassDistribution(numOfBranches, dataSet.numClasses());

    // Instances with a missing value for the attribute do not contribute.
    for (Enumeration cursor = dataSet.enumerateInstances(); cursor.hasMoreElements();) {
        Instance current = (Instance) cursor.nextElement();
        if (current.isMissing(attribIndex)) {
            continue;
        }
        int branch = (int) current.value(attribIndex);
        classDist.addInstance(branch, current);
    }

    // Nothing to record unless the distribution accepts the split.
    if (!classDist.isSplitable(minInstances)) {
        return;
    }
    numSubsets = numOfBranches;
    infoGain = classDist.calculateInfoGain(totalWeights);
    gainRatio = classDist.calculateGainRatio(infoGain);
}

From source file:myclassifier.myC45Pack.SplitModel.java

/**
 * Evaluates a binary split on the numeric attribute at {@code attribIndex}.
 *
 * <p>The leading run of instances with a known attribute value is loaded into
 * subset 1 of a fresh {@code ClassDistribution}; instances are then moved to
 * subset 0 one candidate split point at a time, and the candidate with the best
 * gain ratio is remembered. If a useful split exists, the split point value and
 * the final distribution, information gain and gain ratio are stored in the
 * corresponding fields.
 *
 * <p>NOTE(review): the scan stops at the first missing value and neighbouring
 * instances are compared by index, so this presumably expects {@code dataSet}
 * to be sorted on the attribute with missing values last - confirm against the
 * caller.
 *
 * @param dataSet instances to evaluate the split on
 * @throws Exception declared by the signature; propagated from the calls made here
 */
private void handleNumericAttribute(Instances dataSet) throws Exception {

    int next = 1;
    int last = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double currentGainRatio;
    double minSplit;

    // A numeric attribute always yields two subsets.
    classDist = new ClassDistribution(2, dataSet.numClasses());

    // Only instances with known values are relevant: count them up to the
    // first instance whose attribute value is missing.
    int firstMiss = 0;
    Enumeration instanceEnum = dataSet.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = (Instance) instanceEnum.nextElement();
        if (instance.isMissing(attribIndex)) {
            break;
        }
        classDist.addInstance(1, instance);
        firstMiss++;
    }

    // Minimum weight required in each subset: 10% of the average weight per
    // class, but never below minInstances and never above 25.
    minSplit = 0.1 * (classDist.getTotalWeight()) / ((double) dataSet.numClasses());
    if (minSplit <= minInstances) {
        minSplit = minInstances;
    } else if (minSplit > 25) {
        minSplit = 25;
    }

    // Enough instances with known values?
    if ((double) firstMiss < 2 * minSplit) {
        return;
    }

    // Compute the criteria for every possible split index.
    while (next < firstMiss) {
        // A split is only possible between two distinguishable values.
        if (dataSet.instance(next - 1).value(attribIndex) + 1e-5 < dataSet.instance(next).value(attribIndex)) {

            // Move class values for all instances up to the next possible
            // split point.
            classDist.moveInstancesWithRange(1, 0, dataSet, last, next);

            // Check that both subsets are large enough, then score the split.
            if ((classDist.w_perSubdataset[0] >= minSplit) && (classDist.w_perSubdataset[1] >= minSplit)) {
                currentInfoGain = classDist.calculateInfoGain(totalWeights);
                // The gain ratio is derived from the information gain, as at the
                // other call sites of calculateGainRatio; the previous code passed
                // totalWeights here instead.
                currentGainRatio = classDist.calculateGainRatio(currentInfoGain);
                if (currentGainRatio >= gainRatio) {
                    infoGain = currentInfoGain;
                    gainRatio = currentGainRatio;
                    splitIndex = next - 1;
                }
                numOfSplitPoints++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split? splitIndex stays at -1 when no candidate was
    // accepted; bail out rather than index the data set with it below.
    if (numOfSplitPoints == 0 || splitIndex < 0) {
        return;
    }

    // Compute the modified information gain for the best split.
    infoGain = infoGain - (classDist.log2(numOfSplitPoints) / totalWeights);
    if (infoGain > 0) {
        // Set the instance variables to the values for the best split.
        numSubsets = 2;
        splitPointValue = (dataSet.instance(splitIndex + 1).value(attribIndex)
                + dataSet.instance(splitIndex).value(attribIndex)) / 2;

        // In case of a numerical precision problem choose the smaller value.
        if (splitPointValue == dataSet.instance(splitIndex + 1).value(attribIndex)) {
            splitPointValue = dataSet.instance(splitIndex).value(attribIndex);
        }

        // Restore the distribution for the best split.
        classDist = new ClassDistribution(2, dataSet.numClasses());
        classDist.addRange(0, dataSet, 0, splitIndex + 1);
        classDist.addRange(1, dataSet, splitIndex + 1, firstMiss);

        // Compute the modified gain ratio for the best split.
        gainRatio = classDist.calculateGainRatio(infoGain);
    }
}

From source file:myclassifier.myC45Pack.SplitModel.java

/**
 * Replaces the split point with the largest attribute value found in
 * {@code allInstances} that does not exceed the current split point.
 *
 * <p>Does nothing unless the attribute at {@code attribIndex} is numeric and
 * more than one subset exists. Instances with a missing attribute value are
 * skipped. If no instance qualifies, the split point becomes
 * {@code -Double.MAX_VALUE}.
 *
 * @param allInstances instances whose attribute values are scanned
 */
public final void setSplitPoint(Instances allInstances) {
    boolean applicable = allInstances.attribute(attribIndex).isNumeric() && numSubsets > 1;
    if (!applicable) {
        return;
    }

    double best = -Double.MAX_VALUE;
    for (Enumeration cursor = allInstances.enumerateInstances(); cursor.hasMoreElements();) {
        Instance current = (Instance) cursor.nextElement();
        if (current.isMissing(attribIndex)) {
            continue;
        }
        double value = current.value(attribIndex);
        // Keep the greatest value that is still at or below the split point.
        if (value > best && value <= splitPointValue) {
            best = value;
        }
    }
    splitPointValue = best;
}

From source file:myID3.MyId3.java

/**
 * Construct the tree using the given instance
 * Find the highest attribute value which best at dividing the data
 * @param data Instance/* w  w w . ja va2s .  c o m*/
 */
public void buildTree(Instances data) {
    if (data.numInstances() > 0) {
        // Lets find the highest Information Gain!
        // First compute each information gain attribute
        double IG[] = new double[data.numAttributes()];
        Enumeration enumAttribute = data.enumerateAttributes();
        while (enumAttribute.hasMoreElements()) {
            Attribute attribute = (Attribute) enumAttribute.nextElement();
            IG[attribute.index()] = informationGain(data, attribute);
            // System.out.println(attribute.toString() + ": " + IG[attribute.index()]);
        }
        // Assign it as the tree attribute!
        currentAttribute = data.attribute(maxIndex(IG));
        //System.out.println(Arrays.toString(IG) + IG[currentAttribute.index()]);

        // IG = 0 then current node = leaf!
        if (Utils.eq(IG[currentAttribute.index()], 0)) {
            // Set the class value as the highest frequency of the class
            currentAttribute = null;
            classDistribution = new double[data.numClasses()];
            Enumeration enumInstance = data.enumerateInstances();
            while (enumInstance.hasMoreElements()) {
                Instance temp = (Instance) enumInstance.nextElement();
                classDistribution[(int) temp.classValue()]++;
            }
            Utils.normalize(classDistribution);
            classValue = Utils.maxIndex(classDistribution);
            classAttribute = data.classAttribute();
        } else {
            // Create another node from the current tree
            Instances[] splitData = splitDataByAttribute(data, currentAttribute);
            nodes = new MyId3[currentAttribute.numValues()];

            for (int i = 0; i < currentAttribute.numValues(); i++) {
                nodes[i] = new MyId3();
                nodes[i].buildTree(splitData[i]);
            }
        }
    } else {
        classAttribute = null;
        classValue = Utils.missingValue();
        classDistribution = new double[data.numClasses()];
    }
}

From source file:myID3.MyId3.java

/**
 * Find the entropy from a given dataset
 * @param data/*w  w  w .j ava 2 s  .c  o  m*/
 * @return 
 */
private double entropy(Instances data) {

    /*  Entropy = -(p1 log2 p1) -(p2 log2 p2).... */

    double numInstance = data.numInstances();
    double numClass = data.numClasses();
    double[] distribution = new double[data.numClasses()];

    Enumeration instance = data.enumerateInstances();
    while (instance.hasMoreElements()) {
        Instance temp = (Instance) instance.nextElement();
        /* Count the p1, p2 */
        distribution[(int) temp.classValue()]++;
    }

    /* Sum all the distribution */
    double sum = 0;
    for (int i = 0; i < numClass; i++) {
        distribution[i] = distribution[i] / numInstance;
        if (distribution[i] > 0.0)
            distribution[i] *= Utils.log2(distribution[i]);
        // System.out.println(Arrays.toString(distribution));
        sum += distribution[i];
    }

    return -1 * sum;
}

From source file:myID3.MyId3.java

/**
 * Partitions a data set by the value of the given attribute.
 *
 * <p>One subset is created per attribute value; each instance is appended to the
 * subset whose index equals its attribute value truncated to {@code int}. Every
 * subset is compacted before being returned.
 *
 * @param data instances to partition
 * @param attribute attribute whose values select the target subset
 * @return one subset per attribute value, indexed by value
 */
private Instances[] splitDataByAttribute(Instances data, Attribute attribute) {
    Instances[] partitions = new Instances[attribute.numValues()];

    // Start every partition as an empty set sharing the header of data.
    for (int v = 0; v < partitions.length; v++) {
        partitions[v] = new Instances(data, data.numInstances());
    }

    // Route each instance to the partition of its attribute value.
    for (Enumeration cursor = data.enumerateInstances(); cursor.hasMoreElements();) {
        Instance current = (Instance) cursor.nextElement();
        int bucket = (int) current.value(attribute);
        partitions[bucket].add(current);
    }

    // Release the capacity that was reserved but not used.
    for (Instances partition : partitions) {
        partition.compactify();
    }

    return partitions;
}

From source file:myid3andc45classifier.Model.MyC45.java

/**
 * Prepares the training data and builds the tree.
 *
 * <p>The data is checked against the classifier's capabilities and copied, and
 * instances with a missing class are removed. For every numeric attribute the
 * data is sorted on it, and for each pair of neighbouring instances with different
 * class values the midpoint of their attribute values is handed to
 * {@code convertInstances} unless an attribute named
 * {@code "<attribute name> <midpoint>"} already exists. Missing values are then
 * replaced by the attribute mean (numeric) or by the most frequent value
 * (other attributes). Finally {@code makeMyC45Tree} is called on the result.
 *
 * <p>NOTE(review): missing numeric values are replaced only after the midpoints
 * have been derived - confirm this order is intended.
 *
 * @param data training instances; not modified, a copy is used
 * @throws Exception if the capability check fails or tree construction fails
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);

    data = new Instances(data);
    data.deleteWithMissingClass();

    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (attr.isNumeric()) {
            // TODO: split nominal
            data.sort(attr);
            for (int i = 0; i < data.numInstances() - 1; i++) {
                // Only a class change between neighbours yields a candidate threshold.
                if (data.instance(i).classValue() == data.instance(i + 1).classValue()) {
                    continue;
                }
                double midpoint = (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2;
                // Skip thresholds for which a derived attribute already exists.
                if (data.attribute(attr.name() + " " + midpoint) == null) {
                    data = convertInstances(data, attr, midpoint);
                }
            }

            // Missing-value handling: replace missing values with the mean.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            double mean = attributeStats.numericStats.mean;
            if (Double.isNaN(mean)) {
                mean = 0;
            }
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), mean);
                }
            }
        } else {
            // Missing-value handling: find the most frequent value of the attribute.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            int maxIndex = 0;
            for (int i = 1; i < attr.numValues(); i++) {
                if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) {
                    maxIndex = i;
                }
            }
            // Replace missing values with the most frequent value.
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), maxIndex);
                }
            }
        }
    }
    makeMyC45Tree(data);
}

From source file:myid3andc45classifier.Model.MyC45.java

/**
 * Builds the tree node for the given instances.
 *
 * <p>An empty data set yields a node without attribute and with a missing label.
 * Otherwise the gain ratio of every non-numeric attribute is computed (numeric
 * attributes are excluded by scoring them negative infinity). If the best ratio
 * is NaN or not above {@code epsilon}, the node becomes a leaf labelled with the
 * most frequent class; otherwise the class counts are stored, the data is split
 * on the best attribute and one successor is built per attribute value.
 *
 * @param data instances that reach this node
 * @throws Exception if computing the gain ratio or building a successor fails
 */
public void makeMyC45Tree(Instances data) throws Exception {
    if (data.numInstances() == 0) {
        attribute = null;
        label = Instance.missingValue();
        return;
    }

    // Gain ratio per attribute, indexed by attribute index.
    double[] infoGainRatios = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        if (att.isNumeric()) {
            infoGainRatios[att.index()] = Double.NEGATIVE_INFINITY;
        } else {
            infoGainRatios[att.index()] = computeInfoGainRatio(data, att);
        }
    }

    // Locate the best attribute once instead of re-scanning the array.
    int bestIndex = maxIndex(infoGainRatios);
    double bestRatio = infoGainRatios[bestIndex];
    classAttribute = data.classAttribute();

    // Make a leaf if the gain ratio is (near) zero or undefined.
    if (bestRatio <= epsilon || Double.isNaN(bestRatio)) {
        attribute = null;
        label = maxIndex(listClassCountsValues(data));
        return;
    }

    // Otherwise create successors. The class histogram is counted directly
    // rather than by copying every instance into per-class subsets.
    attribute = data.attribute(bestIndex);
    distribution = listClassCountsValues(data);
    Instances[] splitData = splitInstancesByAttribute(data, attribute);
    successors = new MyC45[attribute.numValues()];
    for (int j = 0; j < attribute.numValues(); j++) {
        successors[j] = new MyC45();
        successors[j].buildClassifier(splitData[j]);
    }
    // TODO: prune
}

From source file:myid3andc45classifier.Model.MyC45.java

/**
 * Counts how many instances fall into each class.
 *
 * @param data instances whose class values are tallied
 * @return an array with one entry per class, holding the number of instances
 *         whose class value (truncated to {@code int}) equals that index
 * @throws Exception declared by the signature
 */
public double[] listClassCountsValues(Instances data) throws Exception {
    // One counter per class.
    final double[] tally = new double[data.numClasses()];

    for (Enumeration cursor = data.enumerateInstances(); cursor.hasMoreElements();) {
        Instance current = (Instance) cursor.nextElement();
        int classIndex = (int) current.classValue();
        tally[classIndex]++;
    }

    return tally;
}

From source file:myid3andc45classifier.Model.MyC45.java

/**
 * Splits a data set into one subset per value of the given attribute.
 *
 * <p>Each instance is appended to the subset whose index equals its value for
 * {@code attr} truncated to {@code int}; every subset is compacted before being
 * returned.
 *
 * @param data instances to split
 * @param attr attribute whose values select the target subset
 * @return one subset per attribute value, indexed by value
 * @throws Exception declared by the signature
 */
public Instances[] splitInstancesByAttribute(Instances data, Attribute attr) throws Exception {
    Instances[] buckets = new Instances[attr.numValues()];

    // Every bucket starts empty, created from data with room for all instances.
    for (int v = 0; v < attr.numValues(); v++) {
        buckets[v] = new Instances(data, data.numInstances());
    }

    // Distribute the instances over the buckets by attribute value.
    for (Enumeration cursor = data.enumerateInstances(); cursor.hasMoreElements();) {
        Instance current = (Instance) cursor.nextElement();
        int target = (int) current.value(attr);
        buckets[target].add(current);
    }

    // Release the capacity that was reserved but not used.
    for (Instances bucket : buckets) {
        bucket.compactify();
    }

    return buckets;
}