Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package myclassifier.myC45Pack; import java.util.Enumeration; import weka.core.Instance; import weka.core.Instances; /** * * @author Fahmi */ public class ClassDistribution { /** Weight instances setiap class per subdataset. */ private double[][] w_perClassPerSubdataset; /** Weight instances setiap subdataset. */ double[] w_perSubdataset; /** Weight instances setiap class. */ private double w_perClass[]; /** Total weight instances. */ private double totalWeights; /** * Constructor distribution. */ public ClassDistribution(int numSubdataset, int numClasses) { w_perSubdataset = new double[numSubdataset]; w_perClass = new double[numClasses]; w_perClassPerSubdataset = new double[numSubdataset][numClasses]; for (int i = 0; i < numSubdataset; i++) { w_perClassPerSubdataset[i] = new double[numClasses]; } totalWeights = 0; } /** * Constructor distribution dengan satu dataset * @param dataSet * * @exception Exception if something goes wrong */ public ClassDistribution(Instances dataSet) throws Exception { w_perClassPerSubdataset = new double[1][dataSet.numClasses()]; w_perSubdataset = new double[1]; w_perClass = new double[dataSet.numClasses()]; totalWeights = 0; Enumeration E = dataSet.enumerateInstances(); while (E.hasMoreElements()) { Instance inst = (Instance) E.nextElement(); addInstance(0, inst); } } /** * Creates a distribution according to given instances and * split model. * * @exception Exception if something goes wrong */ public ClassDistribution(Instances source, C45ClassifierSplitModel modelToUse) throws Exception { int index; Instance instance; double[] weights; w_perClassPerSubdataset = new double[modelToUse.numSubsets()][0]; w_perSubdataset = new double[modelToUse.numSubsets()]; totalWeights = 0; w_perClass = new double[source.numClasses()]; for (int i = 0; i < modelToUse.numSubsets(); i++) { w_perClassPerSubdataset[i] = new double[source.numClasses()]; } Enumeration E = source.enumerateInstances(); while (E.hasMoreElements()) { instance = (Instance) E.nextElement(); index = modelToUse.getSubsetIndex(instance); if (index != -1) { addInstance(index, instance); } else { weights = modelToUse.getWeights(instance); addWeights(instance, weights); } } } /** * Constructor distribution dengan satu dataset */ public ClassDistribution(ClassDistribution sourceDist) { w_perClass = new double[sourceDist.getNumClasses()]; w_perClassPerSubdataset = new double[1][sourceDist.getNumClasses()]; for (int i = 0; i < sourceDist.getNumClasses(); i++) { w_perClassPerSubdataset[0][i] = sourceDist.w_perClass[i]; w_perClass[i] = sourceDist.w_perClass[i]; } totalWeights = sourceDist.totalWeights; w_perSubdataset = new double[1]; w_perSubdataset[0] = totalWeights; } /** * Creates distribution with two bags by merging all bags apart of * the indicated one. */ public ClassDistribution(ClassDistribution sourceDist, int index) { w_perClass = new double[sourceDist.getNumClasses()]; //System.arraycopy(sourceDist.w_perClass,0,w_perClass,0,sourceDist.numClasses()); w_perClassPerSubdataset = new double[2][0]; w_perClassPerSubdataset[0] = new double[sourceDist.getNumClasses()]; System.arraycopy(sourceDist.w_perClassPerSubdataset[index], 0, w_perClassPerSubdataset[0], 0, sourceDist.getNumClasses()); w_perClassPerSubdataset[1] = new double[sourceDist.getNumClasses()]; for (int i = 0; i < sourceDist.getNumClasses(); i++) { w_perClassPerSubdataset[1][i] = sourceDist.w_perClass[i] - w_perClassPerSubdataset[0][i]; w_perClass[i] = sourceDist.w_perClass[i]; } totalWeights = sourceDist.totalWeights; w_perSubdataset = new double[2]; w_perSubdataset[0] = sourceDist.w_perSubdataset[index]; w_perSubdataset[1] = totalWeights - w_perSubdataset[0]; } /** * Returns number of non-empty bags of distribution. */ public int getActualNumSubdataset() { int returnValue = 0; for (int i = 0; i < w_perSubdataset.length; i++) { if (w_perSubdataset[i] > 0) { returnValue++; } } return returnValue; } public int getNumSubdataset() { return w_perSubdataset.length; } /** * Returns number of classes actually occuring in distribution. */ public final int getActualNumClasses() { int returnValue = 0; for (int i = 0; i < w_perClass.length; i++) { if (w_perClass[i] > 0) { returnValue++; } } return returnValue; } /** * Returns number of classes actually occuring in given subdataset. */ public final int getActualNumClasses(int subdatasetIndex) { int returnValue = 0; int i; for (i = 0; i < w_perClass.length; i++) { if (w_perClassPerSubdataset[subdatasetIndex][i] > 0) { returnValue++; } } return returnValue; } /** * return the number of class in this distribution */ public int getNumClasses() { return w_perClass.length; } /** * Return the total weight of the class distribution */ public double getTotalWeight() { return totalWeights; } /** * Adds instance to subDataset. * * @exception Exception if something goes wrong */ public void addInstance(int subDatasetIndex, Instance instance) throws Exception { int classIndex = (int) instance.classValue(); double weight = instance.weight(); w_perClassPerSubdataset[subDatasetIndex][classIndex] = w_perClassPerSubdataset[subDatasetIndex][classIndex] + weight; w_perSubdataset[subDatasetIndex] = w_perSubdataset[subDatasetIndex] + weight; w_perClass[classIndex] = w_perClass[classIndex] + weight; totalWeights = totalWeights + weight; } /** * Subtracts given instance from given bag. * * @exception Exception if something goes wrong */ /*public final void sub(int bagIndex,Instance instance) throws Exception { int classIndex; double weight; classIndex = (int)instance.classValue(); weight = instance.weight(); m_perClassPerBag[bagIndex][classIndex] = m_perClassPerBag[bagIndex][classIndex]-weight; m_perBag[bagIndex] = m_perBag[bagIndex]-weight; m_perClass[classIndex] = m_perClass[classIndex]-weight; totaL = totaL-weight; }*/ /** * Adds counts to given bag. */ /*public final void add(int bagIndex, double[] counts) { double sum = Utils.sum(counts); for (int i = 0; i < counts.length; i++) m_perClassPerBag[bagIndex][i] += counts[i]; m_perBag[bagIndex] = m_perBag[bagIndex]+sum; for (int i = 0; i < counts.length; i++) m_perClass[i] = m_perClass[i]+counts[i]; totaL = totaL+sum; }*/ /** * Adds all instances with unknown values for given attribute, weighted * according to frequency of instances in each bag. * * @exception Exception if something goes wrong */ public void addInstWithMissValue(Instances dataSet, int attIndex) throws Exception { double[] valueProbs; double weight, newWeight; int classIndex; Instance instance; valueProbs = new double[w_perSubdataset.length]; for (int i = 0; i < w_perSubdataset.length; i++) { if (totalWeights == 0) { valueProbs[i] = 1.0 / valueProbs.length; } else { valueProbs[i] = w_perSubdataset[i] / totalWeights; } } Enumeration E = dataSet.enumerateInstances(); while (E.hasMoreElements()) { instance = (Instance) E.nextElement(); if (instance.isMissing(attIndex)) { classIndex = (int) instance.classValue(); weight = instance.weight(); w_perClass[classIndex] = w_perClass[classIndex] + weight; totalWeights += weight; for (int i = 0; i < w_perSubdataset.length; i++) { newWeight = valueProbs[i] * weight; w_perClassPerSubdataset[i][classIndex] += newWeight; w_perSubdataset[i] += newWeight; } } } } /** * Adds all instances in given range to given bag. * * @exception Exception if something goes wrong */ public final void addRange(int subDatasetIndex, Instances dataSet, int startIndex, int lastIndex) throws Exception { double sumOfWeights = 0; int classIndex; Instance data; for (int i = startIndex; i < lastIndex; i++) { data = (Instance) dataSet.instance(i); classIndex = (int) data.classValue(); sumOfWeights += data.weight(); w_perClassPerSubdataset[subDatasetIndex][classIndex] += data.weight(); w_perClass[classIndex] += data.weight(); } w_perSubdataset[subDatasetIndex] += sumOfWeights; totalWeights += sumOfWeights; } /** * Adds given instance to all bags weighting it according to given weights. * * @exception Exception if something goes wrong */ public void addWeights(Instance instance, double[] weights) throws Exception { int classIndex; classIndex = (int) instance.classValue(); for (int i = 0; i < w_perSubdataset.length; i++) { double weight = instance.weight() * weights[i]; w_perClassPerSubdataset[i][classIndex] += weight; w_perSubdataset[i] += weight; w_perClass[classIndex] += weight; totalWeights += weight; } } /** * Checks if at least two bags contain a minimum number of instances. */ public boolean isSplitable(double minInstance) { int counter = 0; for (int i = 0; i < w_perSubdataset.length; i++) { if (w_perSubdataset[i] >= minInstance) { counter++; } } return (counter > 1); } /** * Clones distribution (Deep copy of distribution). */ /*public final Object clone() { int i,j; Distribution newDistribution = new Distribution (m_perBag.length, m_perClass.length); for (i=0;i<m_perBag.length;i++) { newDistribution.m_perBag[i] = m_perBag[i]; for (j=0;j<m_perClass.length;j++) newDistribution.m_perClassPerBag[i][j] = m_perClassPerBag[i][j]; } for (j=0;j<m_perClass.length;j++) newDistribution.m_perClass[j] = m_perClass[j]; newDistribution.totaL = totaL; return newDistribution; } */ /** * Deletes given instance from given bag. * * @exception Exception if something goes wrong */ /*public final void del(int bagIndex,Instance instance) throws Exception { int classIndex; double weight; classIndex = (int)instance.classValue(); weight = instance.weight(); m_perClassPerBag[bagIndex][classIndex] = m_perClassPerBag[bagIndex][classIndex]-weight; m_perBag[bagIndex] = m_perBag[bagIndex]-weight; m_perClass[classIndex] = m_perClass[classIndex]-weight; totaL = totaL-weight; } */ /** * Deletes all instances in given range from given bag. * * @exception Exception if something goes wrong */ /*public final void delRange(int bagIndex,Instances source, int startIndex, int lastPlusOne) throws Exception { double sumOfWeights = 0; int classIndex; Instance instance; int i; for (i = startIndex; i < lastPlusOne; i++) { instance = (Instance) source.instance(i); classIndex = (int)instance.classValue(); sumOfWeights = sumOfWeights+instance.weight(); m_perClassPerBag[bagIndex][classIndex] -= instance.weight(); m_perClass[classIndex] -= instance.weight(); } m_perBag[bagIndex] -= sumOfWeights; totaL -= sumOfWeights; } */ /** * Prints distribution. */ /* public final String dumpDistribution() { StringBuffer text; int i,j; text = new StringBuffer(); for (i=0;i<m_perBag.length;i++) { text.append("Bag num "+i+"\n"); for (j=0;j<m_perClass.length;j++) text.append("Class num "+j+" "+m_perClassPerBag[i][j]+"\n"); } return text.toString(); } */ /** * Sets all counts to zero. */ /* public final void initialize() { for (int i = 0; i < m_perClass.length; i++) m_perClass[i] = 0; for (int i = 0; i < m_perBag.length; i++) m_perBag[i] = 0; for (int i = 0; i < m_perBag.length; i++) for (int j = 0; j < m_perClass.length; j++) m_perClassPerBag[i][j] = 0; totaL = 0; } */ /** * Returns matrix with distribution of class values. */ /*public final double[][] matrix() { return m_perClassPerBag; }*/ /** * Returns index of bag containing maximum number of instances. */ /*public final int maxBag() { double max; int maxIndex; int i; max = 0; maxIndex = -1; for (i=0;i<m_perBag.length;i++) if (Utils.grOrEq(m_perBag[i],max)) { max = m_perBag[i]; maxIndex = i; } return maxIndex; }*/ /** * Returns class with highest frequency over all bags. */ public final int maxClass() { double max = 0; int maxIndex = 0; for (int i = 0; i < w_perClass.length; i++) { if (w_perClass[i] > max) { max = w_perClass[i]; maxIndex = i; } } return maxIndex; } /** * Returns class with highest frequency for given subdatas. */ public final int maxClass(int subDatasetIndex) { double max = 0; int maxIndex = 0; int i; if (w_perSubdataset[subDatasetIndex] > 0) { for (i = 0; i < w_perClass.length; i++) if (w_perClassPerSubdataset[subDatasetIndex][i] > max) { max = w_perClassPerSubdataset[subDatasetIndex][i]; maxIndex = i; } return maxIndex; } else return maxClass(); } /** * Returns perClass(maxClass()). */ public final double numCorrect() { return w_perClass[maxClass()]; } /** * Returns perClassPerSubdataset(index,maxClass(index)). */ public final double numCorrect(int index) { return w_perClassPerSubdataset[index][maxClass(index)]; } /** * Returns total-numCorrect(). */ public final double numIncorrect() { return totalWeights - numCorrect(); } /** * Returns perBag(index)-numCorrect(index). */ public final double numIncorrect(int index) { return w_perSubdataset[index] - numCorrect(index); } /** * Returns number of (possibly fractional) instances of given class in * given bag. */ /*public final double perClassPerBag(int bagIndex, int classIndex) { return m_perClassPerBag[bagIndex][classIndex]; }*/ /** * Returns number of (possibly fractional) instances in given bag. */ /*public final double perBag(int bagIndex) { return m_perBag[bagIndex]; }*/ /** * Returns number of (possibly fractional) instances of given class. */ /*public final double perClass(int classIndex) { return m_perClass[classIndex]; }*/ /** * Returns relative frequency of class over all bags with * Laplace correction. */ public double laplaceProb(int classIndex) { return (w_perClass[classIndex] + 1) / (totalWeights + (double) w_perClass.length); } /** * Returns relative frequency of class for given bag. */ public double laplaceProb(int classIndex, int intIndex) { if (w_perSubdataset[intIndex] > 0) { return (w_perClassPerSubdataset[intIndex][classIndex] + 1.0) / (w_perSubdataset[intIndex] + (double) w_perClass.length); } else { return laplaceProb(classIndex); } } /** * Returns probability of a class */ public double prob(int classIndex) { if (totalWeights == 0) { return w_perClass[classIndex] / totalWeights; } else { return 0; } } /** * Returns relative frequency of class for given subDataset. */ public double prob(int classIndex, int subDatasetIndex) { if (w_perSubdataset[subDatasetIndex] > 0) { return w_perClassPerSubdataset[subDatasetIndex][classIndex] / w_perSubdataset[subDatasetIndex]; } else { return prob(classIndex); } } /** * Subtracts the given distribution from this one. The results * has only one bag. */ /*public final Distribution subtract(Distribution toSubstract) { Distribution newDist = new Distribution(1,m_perClass.length); newDist.m_perBag[0] = totaL-toSubstract.totaL; newDist.totaL = newDist.m_perBag[0]; for (int i = 0; i < m_perClass.length; i++) { newDist.m_perClassPerBag[0][i] = m_perClass[i] - toSubstract.m_perClass[i]; newDist.m_perClass[i] = newDist.m_perClassPerBag[0][i]; } return newDist; }*/ /** * Shifts given instance from one bag to another one. * * @exception Exception if something goes wrong */ /*public final void shift(int from,int to,Instance instance) throws Exception { int classIndex; double weight; classIndex = (int)instance.classValue(); weight = instance.weight(); m_perClassPerBag[from][classIndex] -= weight; m_perClassPerBag[to][classIndex] += weight; m_perBag[from] -= weight; m_perBag[to] += weight; } */ /** * Move instances in given range from one bag to another one. * * @exception Exception if something goes wrong */ public void moveInstancesWithRange(int from, int to, Instances dataSet, int startIndex, int lastIndex) throws Exception { int classIndex; double weight; Instance data; for (int i = startIndex; i < lastIndex; i++) { data = (Instance) dataSet.instance(i); classIndex = (int) data.classValue(); weight = data.weight(); w_perClassPerSubdataset[from][classIndex] -= weight; w_perClassPerSubdataset[to][classIndex] += weight; w_perSubdataset[from] -= weight; w_perSubdataset[to] += weight; } } /** * Mengembalikan hasil dari log2 */ public double log2(double num) { // Constant hard coded for efficiency reasons if (num < 1e-6) return 0; else return num * Math.log(num) / Math.log(2); } /** * Menghitung entropi */ private double calcInitialEntropy() { double initEntropy = 0; for (int i = 0; i < getNumClasses(); i++) { double p = w_perClass[i] / totalWeights; initEntropy = initEntropy + (p * log2(p)); } return -initEntropy; } public double calculateInfoGain(double instancesTotalWeight) { /* initial entropy */ /* entropy = -(p1 * log2 p1 + p2 * log2 p2 + ...) */ double initialEntropy = 0; double unknownValues = 0; double unknownRate = 0; initialEntropy = calcInitialEntropy(); for (int i = 0; i < getNumSubdataset(); i++) { double finalEntropy = 0; for (int j = 0; j < getNumClasses(); j++) { double p = 0; if (w_perSubdataset[i] > 0) { p = w_perClassPerSubdataset[i][j] / w_perSubdataset[i]; } finalEntropy = finalEntropy + (p * log2(p)); } finalEntropy = -1 * finalEntropy; initialEntropy = initialEntropy - (w_perSubdataset[i] / totalWeights * finalEntropy); } unknownValues = instancesTotalWeight - totalWeights; unknownRate = unknownValues / instancesTotalWeight; return ((1 - unknownRate) * initialEntropy); } /** * Menghitung gain ratio * @param infoGain * @return */ public double calculateGainRatio(double infoGain) { double splitInformation = 0; double gainRatio; for (int i = 0; i < getNumSubdataset(); i++) { double p = w_perSubdataset[i] / totalWeights; splitInformation = splitInformation - (p * log2(p)); } gainRatio = infoGain / splitInformation; return gainRatio; } }