List of usage examples for weka.core Instances enumerateInstances
public Enumeration<Instance> enumerateInstances()
From source file:myclassifier.myC45Pack.SplitModel.java
private void handleNominalAttribute(Instances dataSet) throws Exception { Instance instance;/*from w ww . j a v a 2 s . c o m*/ classDist = new ClassDistribution(numOfBranches, dataSet.numClasses()); Enumeration instanceEnum = dataSet.enumerateInstances(); while (instanceEnum.hasMoreElements()) { instance = (Instance) instanceEnum.nextElement(); if (!instance.isMissing(attribIndex)) { classDist.addInstance((int) instance.value(attribIndex), instance); } } // Check if minimum number of Instances in at least two // subsets. if (classDist.isSplitable(minInstances)) { numSubsets = numOfBranches; infoGain = classDist.calculateInfoGain(totalWeights); gainRatio = classDist.calculateGainRatio(infoGain); } }
From source file:myclassifier.myC45Pack.SplitModel.java
private void handleNumericAttribute(Instances dataSet) throws Exception { int firstMiss; int next = 1; int last = 0; int splitIndex = -1; double currentInfoGain; double currentGainRatio; double minSplit; Instance instance;/* ww w . java 2 s . co m*/ int i; boolean instanceMissing = false; // Current attribute is a numeric attribute. classDist = new ClassDistribution(2, dataSet.numClasses()); // Only Instances with known values are relevant. Enumeration instanceEnum = dataSet.enumerateInstances(); i = 0; while ((instanceEnum.hasMoreElements() && (!instanceMissing))) { instance = (Instance) instanceEnum.nextElement(); if (instance.isMissing(attribIndex)) { instanceMissing = true; } else { classDist.addInstance(1, instance); i++; } } firstMiss = i; // Compute minimum number of Instances required in each // subset. minSplit = 0.1 * (classDist.getTotalWeight()) / ((double) dataSet.numClasses()); if (minSplit <= minInstances) { minSplit = minInstances; } else if (minSplit > 25) { minSplit = 25; } // Enough Instances with known values? if ((double) firstMiss < 2 * minSplit) { return; } // Compute values of criteria for all possible split // indices. //defaultEnt = infoGainCrit.oldEnt(m_distribution); while (next < firstMiss) { if (dataSet.instance(next - 1).value(attribIndex) + 1e-5 < dataSet.instance(next).value(attribIndex)) { // Move class values for all Instances up to next // possible split point. classDist.moveInstancesWithRange(1, 0, dataSet, last, next); // Check if enough Instances in each subset and compute // values for criteria. if ((classDist.w_perSubdataset[0] >= minSplit) && (classDist.w_perSubdataset[1] >= minSplit)) { currentInfoGain = classDist.calculateInfoGain(totalWeights); currentGainRatio = classDist.calculateGainRatio(totalWeights); if (currentGainRatio >= gainRatio) { infoGain = currentInfoGain; gainRatio = currentGainRatio; splitIndex = next - 1; } numOfSplitPoints++; } last = next; } next++; } // Was there any useful split? 
if (numOfSplitPoints == 0) { return; } // Compute modified information gain for best split. infoGain = infoGain - (classDist.log2(numOfSplitPoints) / totalWeights); if (infoGain > 0) { // Set instance variables' values to values for // best split. numSubsets = 2; splitPointValue = (dataSet.instance(splitIndex + 1).value(attribIndex) + dataSet.instance(splitIndex).value(attribIndex)) / 2; // In case we have a numerical precision problem we need to choose the // smaller value if (splitPointValue == dataSet.instance(splitIndex + 1).value(attribIndex)) { splitPointValue = dataSet.instance(splitIndex).value(attribIndex); } // Restore distributioN for best split. classDist = new ClassDistribution(2, dataSet.numClasses()); classDist.addRange(0, dataSet, 0, splitIndex + 1); classDist.addRange(1, dataSet, splitIndex + 1, firstMiss); // Compute modified gain ratio for best split. gainRatio = classDist.calculateGainRatio(infoGain); } }
From source file:myclassifier.myC45Pack.SplitModel.java
/**
 * Moves the split point onto the largest known attribute value in
 * {@code allInstances} that does not exceed the current split point.
 *
 * <p>Nothing happens unless the split attribute is numeric and there is more than
 * one subset. If no known value qualifies, the split point becomes
 * {@code -Double.MAX_VALUE}.
 *
 * @param allInstances the instances whose attribute values are scanned
 */
public final void setSplitPoint(Instances allInstances) {
    boolean numericSplit = allInstances.attribute(attribIndex).isNumeric() && numSubsets > 1;
    if (!numericSplit) {
        return;
    }

    double closestBelow = -Double.MAX_VALUE;
    for (int row = 0; row < allInstances.numInstances(); row++) {
        Instance candidate = allInstances.instance(row);
        if (candidate.isMissing(attribIndex)) {
            continue;
        }
        double value = candidate.value(attribIndex);
        if (value > closestBelow && value <= splitPointValue) {
            closestBelow = value;
        }
    }
    splitPointValue = closestBelow;
}
From source file:myID3.MyId3.java
/** * Construct the tree using the given instance * Find the highest attribute value which best at dividing the data * @param data Instance/* w w w . ja va2s . c o m*/ */ public void buildTree(Instances data) { if (data.numInstances() > 0) { // Lets find the highest Information Gain! // First compute each information gain attribute double IG[] = new double[data.numAttributes()]; Enumeration enumAttribute = data.enumerateAttributes(); while (enumAttribute.hasMoreElements()) { Attribute attribute = (Attribute) enumAttribute.nextElement(); IG[attribute.index()] = informationGain(data, attribute); // System.out.println(attribute.toString() + ": " + IG[attribute.index()]); } // Assign it as the tree attribute! currentAttribute = data.attribute(maxIndex(IG)); //System.out.println(Arrays.toString(IG) + IG[currentAttribute.index()]); // IG = 0 then current node = leaf! if (Utils.eq(IG[currentAttribute.index()], 0)) { // Set the class value as the highest frequency of the class currentAttribute = null; classDistribution = new double[data.numClasses()]; Enumeration enumInstance = data.enumerateInstances(); while (enumInstance.hasMoreElements()) { Instance temp = (Instance) enumInstance.nextElement(); classDistribution[(int) temp.classValue()]++; } Utils.normalize(classDistribution); classValue = Utils.maxIndex(classDistribution); classAttribute = data.classAttribute(); } else { // Create another node from the current tree Instances[] splitData = splitDataByAttribute(data, currentAttribute); nodes = new MyId3[currentAttribute.numValues()]; for (int i = 0; i < currentAttribute.numValues(); i++) { nodes[i] = new MyId3(); nodes[i].buildTree(splitData[i]); } } } else { classAttribute = null; classValue = Utils.missingValue(); classDistribution = new double[data.numClasses()]; } }
From source file:myID3.MyId3.java
/** * Find the entropy from a given dataset * @param data/*w w w .j ava 2 s .c o m*/ * @return */ private double entropy(Instances data) { /* Entropy = -(p1 log2 p1) -(p2 log2 p2).... */ double numInstance = data.numInstances(); double numClass = data.numClasses(); double[] distribution = new double[data.numClasses()]; Enumeration instance = data.enumerateInstances(); while (instance.hasMoreElements()) { Instance temp = (Instance) instance.nextElement(); /* Count the p1, p2 */ distribution[(int) temp.classValue()]++; } /* Sum all the distribution */ double sum = 0; for (int i = 0; i < numClass; i++) { distribution[i] = distribution[i] / numInstance; if (distribution[i] > 0.0) distribution[i] *= Utils.log2(distribution[i]); // System.out.println(Arrays.toString(distribution)); sum += distribution[i]; } return -1 * sum; }
From source file:myID3.MyId3.java
/** * Create split of data based on the value of attribute * @param data//from w ww . j av a 2s .co m * @param attribute * @return */ private Instances[] splitDataByAttribute(Instances data, Attribute attribute) { // Init the object first Instances[] subSet = new Instances[attribute.numValues()]; for (int i = 0; i < attribute.numValues(); i++) { subSet[i] = new Instances(data, data.numInstances()); } // Split it! Enumeration instanceEnum = data.enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); subSet[(int) instance.value(attribute)].add(instance); } // Compact the array of object by removing the empty array for (int i = 0; i < attribute.numValues(); i++) { subSet[i].compactify(); // System.out.println(subSet[i]); } return subSet; }
From source file:myid3andc45classifier.Model.MyC45.java
@Override public void buildClassifier(Instances data) throws Exception { getCapabilities().testWithFail(data); data = new Instances(data); data.deleteWithMissingClass();/*from ww w .j av a 2 s .co m*/ Enumeration enumAtt = data.enumerateAttributes(); while (enumAtt.hasMoreElements()) { Attribute attr = (Attribute) enumAtt.nextElement(); if (attr.isNumeric()) { ArrayList<Double> mid = new ArrayList<Double>(); Instances savedData = null; double temp, max = Double.NEGATIVE_INFINITY; // TODO: split nominal data.sort(attr); for (int i = 0; i < data.numInstances() - 1; i++) { if (data.instance(i).classValue() != data.instance(i + 1).classValue()) { if (data.attribute(attr.name() + " " + (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2) == null) { data = convertInstances(data, attr, (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2); //temp = computeInfoGainRatio(newData, newData.attribute(newData.numAttributes()-1)); //System.out.println("attribute "+newData.attribute(newData.numAttributes()-1).name()); //if (temp > max) { // max = temp; // savedData = newData; //} } } } //Penanganan Missing Value AttributeStats attributeStats = data.attributeStats(attr.index()); double mean = attributeStats.numericStats.mean; if (Double.isNaN(mean)) mean = 0; // Replace missing value with mean Enumeration instEnumerate = data.enumerateInstances(); while (instEnumerate.hasMoreElements()) { Instance instance = (Instance) instEnumerate.nextElement(); if (instance.isMissing(attr.index())) { instance.setValue(attr.index(), mean); } } //data = new Instances(savedData); } else { //Penanganan Missing Value AttributeStats attributeStats = data.attributeStats(attr.index()); int maxIndex = 0; for (int i = 1; i < attr.numValues(); i++) { if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) { maxIndex = i; } } // Replace missing value with max index Enumeration instEnumerate = data.enumerateInstances(); while 
(instEnumerate.hasMoreElements()) { Instance instance = (Instance) instEnumerate.nextElement(); if (instance.isMissing(attr.index())) { instance.setValue(attr.index(), maxIndex); } } } } makeMyC45Tree(data); }
From source file:myid3andc45classifier.Model.MyC45.java
public void makeMyC45Tree(Instances data) throws Exception { if (data.numInstances() == 0) { attribute = null;/*from w w w .j av a 2s. com*/ label = Instance.missingValue(); return; } //System.out.println("NEW"); double[] infoGainRatios = new double[data.numAttributes()]; Enumeration attEnum = data.enumerateAttributes(); while (attEnum.hasMoreElements()) { Attribute att = (Attribute) attEnum.nextElement(); if (!att.isNumeric()) infoGainRatios[att.index()] = computeInfoGainRatio(data, att); else infoGainRatios[att.index()] = Double.NEGATIVE_INFINITY; //System.out.println(att.name() + " " + infoGainRatios[att.index()]); } // TODO: build the tree attribute = data.attribute(maxIndex(infoGainRatios)); //System.out.println(infoGainRatios[maxIndex(infoGainRatios)]); // Make leaf if information gain is zero. // Otherwise create successors. if (infoGainRatios[maxIndex(infoGainRatios)] <= epsilon || Double.isNaN(infoGainRatios[maxIndex(infoGainRatios)])) { attribute = null; double[] numClasses = new double[data.numClasses()]; Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); numClasses[(int) inst.classValue()]++; } label = maxIndex(numClasses); classAttribute = data.classAttribute(); } else { classAttribute = data.classAttribute(); Instances[] splitData = splitInstancesByAttribute(data, attribute); Instances[] distrData = splitInstancesByAttribute(data, data.classAttribute()); distribution = new double[distrData.length]; for (int j = 0; j < distribution.length; j++) { distribution[j] = distrData[j].numInstances(); } successors = new MyC45[attribute.numValues()]; for (int j = 0; j < attribute.numValues(); j++) { successors[j] = new MyC45(); successors[j].buildClassifier(splitData[j]); } } // TODO: prune //pruneTree(data); }
From source file:myid3andc45classifier.Model.MyC45.java
public double[] listClassCountsValues(Instances data) throws Exception { double[] classCounts = new double[data.numClasses()]; //array untuk menyimpan value kelas sesuai jumlah kelas Enumeration instanceEnum = data.enumerateInstances(); //Masukkan data ke array while (instanceEnum.hasMoreElements()) { Instance inst = (Instance) instanceEnum.nextElement(); classCounts[(int) inst.classValue()]++; }//from w ww . j ava 2s. co m return classCounts; }
From source file:myid3andc45classifier.Model.MyC45.java
public Instances[] splitInstancesByAttribute(Instances data, Attribute attr) throws Exception { //Split data menjadi beberapa instances sesuai dengan jumlah jenis data pada atribut Instances[] splitData = new Instances[attr.numValues()]; for (int i = 0; i < attr.numValues(); i++) { splitData[i] = new Instances(data, data.numInstances()); }/*from w w w. j a v a2s . c om*/ Enumeration instanceEnum = data.enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance inst = (Instance) instanceEnum.nextElement(); splitData[(int) inst.value(attr)].add(inst); } for (int i = 0; i < splitData.length; i++) { splitData[i].compactify(); } return splitData; }