List of usage examples for weka.core ContingencyTables entropyConditionedOnRows
public static double entropyConditionedOnRows(double[][] matrix)
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for nominal attribute and nominal class * and returns value./*from ww w. j a v a2s .c om*/ * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNominalNominal(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal; double[][] counts = new double[m_Instances.attribute(index).numValues() + 1][m_Instances.numClasses()]; double[] sumCounts = new double[m_Instances.numClasses()]; double[][] bestDist = new double[3][m_Instances.numClasses()]; int numMissing = 0; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (inst.isMissing(index)) { numMissing++; counts[m_Instances.attribute(index).numValues()][(int) inst.classValue()] += inst.weight(); } else { counts[(int) inst.value(index)][(int) inst.classValue()] += inst.weight(); } } // Compute sum of counts for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { sumCounts[j] += counts[i][j]; } } // Make split counts for each possible split and evaluate System.arraycopy(counts[m_Instances.attribute(index).numValues()], 0, m_Distribution[2], 0, m_Instances.numClasses()); for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { m_Distribution[0][j] = counts[i][j]; m_Distribution[1][j] = sumCounts[j] - counts[i][j]; } currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution); if (currVal < bestVal) { bestVal = currVal; m_SplitPoint = (double) i; for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } } } // No missing values in training data. if (numMissing == 0) { System.arraycopy(sumCounts, 0, bestDist[2], 0, m_Instances.numClasses()); } m_Distribution = bestDist; return bestVal; }
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for numeric attribute and nominal class * and returns value./*from www . java 2s . co m*/ * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNumericNominal(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal, currCutPoint; int numMissing = 0; double[] sum = new double[m_Instances.numClasses()]; double[][] bestDist = new double[3][m_Instances.numClasses()]; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (!inst.isMissing(index)) { m_Distribution[1][(int) inst.classValue()] += inst.weight(); } else { m_Distribution[2][(int) inst.classValue()] += inst.weight(); numMissing++; } } System.arraycopy(m_Distribution[1], 0, sum, 0, m_Instances.numClasses()); // Save current distribution as best distribution for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } // Sort instances m_Instances.sort(index); // Make split counts for each possible split and evaluate for (int i = 0; i < m_Instances.numInstances() - (numMissing + 1); i++) { Instance inst = m_Instances.instance(i); Instance instPlusOne = m_Instances.instance(i + 1); m_Distribution[0][(int) inst.classValue()] += inst.weight(); m_Distribution[1][(int) inst.classValue()] -= inst.weight(); if (inst.value(index) < instPlusOne.value(index)) { currCutPoint = (inst.value(index) + instPlusOne.value(index)) / 2.0; currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution); if (currVal < bestVal) { m_SplitPoint = currCutPoint; bestVal = currVal; for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } } } } // No missing values in training data. 
if (numMissing == 0) { System.arraycopy(sum, 0, bestDist[2], 0, m_Instances.numClasses()); } m_Distribution = bestDist; return bestVal; }
From source file:feature.InfoGainEval.java
License:Open Source License
/** * Initializes an information gain attribute evaluator. Discretizes all * attributes that are numeric./* w ww .j av a 2 s. c om*/ * * @param data * set of instances serving as training data * @throws Exception * if the evaluator has not been generated successfully */ public double computeInfoGain(Instances data, int att) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); int classIndex = data.classIndex(); int numInstances = data.numInstances(); if (!m_Binarize) { Discretize disTransform = new Discretize(); disTransform.setUseBetterEncoding(true); disTransform.setInputFormat(data); data = Filter.useFilter(data, disTransform); } else { NumericToBinary binTransform = new NumericToBinary(); binTransform.setInputFormat(data); data = Filter.useFilter(data, binTransform); } int numClasses = data.attribute(classIndex).numValues(); // Reserve space and initialize counters double[][][] counts = new double[data.numAttributes()][][]; for (int k = 0; k < data.numAttributes(); k++) { if (k != classIndex) { int numValues = data.attribute(k).numValues(); counts[k] = new double[numValues + 1][numClasses + 1]; } } // Initialize counters double[] temp = new double[numClasses + 1]; for (int k = 0; k < numInstances; k++) { Instance inst = data.instance(k); if (inst.classIsMissing()) { temp[numClasses] += inst.weight(); } else { temp[(int) inst.classValue()] += inst.weight(); } } for (int k = 0; k < counts.length; k++) { if (k != classIndex) { for (int i = 0; i < temp.length; i++) { counts[k][0][i] = temp[i]; } } } // Get counts for (int k = 0; k < numInstances; k++) { Instance inst = data.instance(k); for (int i = 0; i < inst.numValues(); i++) { if (inst.index(i) != classIndex) { if (inst.isMissingSparse(i) || inst.classIsMissing()) { if (!inst.isMissingSparse(i)) { counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight(); counts[inst.index(i)][0][numClasses] -= inst.weight(); } else if (!inst.classIsMissing()) { 
counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst .classValue()] += inst.weight(); counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight(); } else { counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst .weight(); counts[inst.index(i)][0][numClasses] -= inst.weight(); } } else { counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight(); counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight(); } } } } // distribute missing counts if required if (m_missing_merge) { for (int k = 0; k < data.numAttributes(); k++) { if (k != classIndex) { int numValues = data.attribute(k).numValues(); // Compute marginals double[] rowSums = new double[numValues]; double[] columnSums = new double[numClasses]; double sum = 0; for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { rowSums[i] += counts[k][i][j]; columnSums[j] += counts[k][i][j]; } sum += rowSums[i]; } if (Utils.gr(sum, 0)) { double[][] additions = new double[numValues][numClasses]; // Compute what needs to be added to each row for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j]; } } // Compute what needs to be added to each column for (int i = 0; i < numClasses; i++) { for (int j = 0; j < numValues; j++) { additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses]; } } // Compute what needs to be added to each cell for (int i = 0; i < numClasses; i++) { for (int j = 0; j < numValues; j++) { additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses]; } } // Make new contingency table double[][] newTable = new double[numValues][numClasses]; for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { newTable[i][j] = counts[k][i][j] + additions[i][j]; } } counts[k] = newTable; } } } } // Compute info gains m_InfoGains = new double[data.numAttributes()]; 
m_InfoGains[att] = (ContingencyTables.entropyOverColumns(counts[att]) - ContingencyTables.entropyConditionedOnRows(counts[att])); return m_InfoGains[att]; }
From source file:feature.InfoGainEval.java
License:Open Source License
public void buildEvaluator(Instances data) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); int classIndex = data.classIndex(); int numInstances = data.numInstances(); if (!m_Binarize) { Discretize disTransform = new Discretize(); disTransform.setUseBetterEncoding(true); disTransform.setInputFormat(data); data = Filter.useFilter(data, disTransform); } else {/* w w w .j a v a2s . c om*/ NumericToBinary binTransform = new NumericToBinary(); binTransform.setInputFormat(data); data = Filter.useFilter(data, binTransform); } int numClasses = data.attribute(classIndex).numValues(); // Reserve space and initialize counters double[][][] counts = new double[data.numAttributes()][][]; for (int k = 0; k < data.numAttributes(); k++) { if (k != classIndex) { int numValues = data.attribute(k).numValues(); counts[k] = new double[numValues + 1][numClasses + 1]; } } // Initialize counters double[] temp = new double[numClasses + 1]; for (int k = 0; k < numInstances; k++) { Instance inst = data.instance(k); if (inst.classIsMissing()) { temp[numClasses] += inst.weight(); } else { temp[(int) inst.classValue()] += inst.weight(); } } for (int k = 0; k < counts.length; k++) { if (k != classIndex) { for (int i = 0; i < temp.length; i++) { counts[k][0][i] = temp[i]; } } } // Get counts for (int k = 0; k < numInstances; k++) { Instance inst = data.instance(k); for (int i = 0; i < inst.numValues(); i++) { if (inst.index(i) != classIndex) { if (inst.isMissingSparse(i) || inst.classIsMissing()) { if (!inst.isMissingSparse(i)) { counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight(); counts[inst.index(i)][0][numClasses] -= inst.weight(); } else if (!inst.classIsMissing()) { counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst .classValue()] += inst.weight(); counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight(); } else { counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += 
inst .weight(); counts[inst.index(i)][0][numClasses] -= inst.weight(); } } else { counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight(); counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight(); } } } } // distribute missing counts if required if (m_missing_merge) { for (int k = 0; k < data.numAttributes(); k++) { if (k != classIndex) { int numValues = data.attribute(k).numValues(); // Compute marginals double[] rowSums = new double[numValues]; double[] columnSums = new double[numClasses]; double sum = 0; for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { rowSums[i] += counts[k][i][j]; columnSums[j] += counts[k][i][j]; } sum += rowSums[i]; } if (Utils.gr(sum, 0)) { double[][] additions = new double[numValues][numClasses]; // Compute what needs to be added to each row for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j]; } } // Compute what needs to be added to each column for (int i = 0; i < numClasses; i++) { for (int j = 0; j < numValues; j++) { additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses]; } } // Compute what needs to be added to each cell for (int i = 0; i < numClasses; i++) { for (int j = 0; j < numValues; j++) { additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses]; } } // Make new contingency table double[][] newTable = new double[numValues][numClasses]; for (int i = 0; i < numValues; i++) { for (int j = 0; j < numClasses; j++) { newTable[i][j] = counts[k][i][j] + additions[i][j]; } } counts[k] = newTable; } } } } // Compute info gains m_InfoGains = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { if (i != classIndex) { m_InfoGains[i] = (ContingencyTables.entropyOverColumns(counts[i]) - ContingencyTables.entropyConditionedOnRows(counts[i])); } } }
From source file:moa.reduction.bayes.IncrInfoThAttributeEval.java
License:Open Source License
@Override /**/* w ww . j ava2s . com*/ * Update the contingency tables and the rankings for each features using the counters. * Counters are updated in each iteration. */ public void applySelection() { if (counts != null && updated) { m_InfoValues = new double[counts.length]; for (int i = 0; i < counts.length; i++) { if (i != classIndex) { Set<Key> keys = counts[i].keySet(); Set<Entry<Key, Float>> entries = counts[i].entrySet(); Set<Float> avalues = new HashSet<Float>(); Set<Float> cvalues = new HashSet<Float>(); for (Iterator<Key> it = keys.iterator(); it.hasNext();) { Key key = it.next(); avalues.add(key.x); cvalues.add(key.y); } Map<Float, Integer> apos = new HashMap<Float, Integer>(); Map<Float, Integer> cpos = new HashMap<Float, Integer>(); int aidx = 0; for (Iterator<Float> it = avalues.iterator(); it.hasNext();) { Float f = it.next(); apos.put(f, aidx++); } int cidx = 0; for (Iterator<Float> it = cvalues.iterator(); it.hasNext();) { Float f = it.next(); cpos.put(f, cidx++); } double[][] lcounts = new double[avalues.size()][cvalues.size()]; for (Iterator<Entry<Key, Float>> it = entries.iterator(); it.hasNext();) { Entry<Key, Float> entry = it.next(); lcounts[apos.get(entry.getKey().x)][cpos.get(entry.getKey().y)] = entry.getValue(); } switch (method) { case 1: m_InfoValues[i] = ContingencyTables.symmetricalUncertainty(lcounts); break; default: m_InfoValues[i] = (ContingencyTables.entropyOverColumns(lcounts) - ContingencyTables.entropyConditionedOnRows(lcounts)); break; } } } //System.out.println("Attribute values: " + Arrays.toString(m_InfoValues)); updated = false; } }
From source file:moa.reduction.bayes.PIDdiscretize.java
License:Open Source License
private double[] cutPointsForSubset(int attIndex, int first, int lastPlusOne) { //Map<Integer, Double> counts, bestCounts; double[] left, right, cutPoints; //double step = ((float) totalCount) / m_CutPointsL1.get(index).size(); double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1, currentEntropy, bestEntropy, priorEntropy, gain;//w w w . j a v a 2s . c om int bestIndex = -1, numCutPoints = 0; double numInstances = 0; // Compute number of instances in set if ((lastPlusOne - first) < 2) { return null; } // Get the greatest class observed till here int numClasses = 0; for (int i = first; i < lastPlusOne; i++) { Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i); for (Integer key : classDist.keySet()) { if (key > numClasses) { numClasses = key; } } } numClasses += 1; // Compute class counts. double[][] counts = new double[2][numClasses]; for (int i = first; i < lastPlusOne; i++) { Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i); for (Map.Entry<Integer, Float> entry : classDist.entrySet()) { counts[1][entry.getKey()] += entry.getValue(); numInstances += entry.getValue(); } } // Save prior counts double[] priorCounts = new double[numClasses]; System.arraycopy(counts[1], 0, priorCounts, 0, numClasses); // Entropy of the full set priorEntropy = ContingencyTables.entropy(priorCounts); bestEntropy = priorEntropy; priorEntropy = ContingencyTables.entropy(priorCounts); bestEntropy = priorEntropy; // Find best entropy. 
double[][] bestCounts = new double[2][numClasses]; for (int i = first; i < (lastPlusOne - 1); i++) { Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i); for (Map.Entry<Integer, Float> entry : classDist.entrySet()) { counts[0][entry.getKey()] += entry.getValue(); counts[1][entry.getKey()] -= entry.getValue(); } currentCutPoint = m_CutPointsL1.get(attIndex).get(i); currentEntropy = ContingencyTables.entropyConditionedOnRows(counts); if (currentEntropy < bestEntropy) { bestCutPoint = currentCutPoint; bestEntropy = currentEntropy; bestIndex = i; System.arraycopy(counts[0], 0, bestCounts[0], 0, numClasses); System.arraycopy(counts[1], 0, bestCounts[1], 0, numClasses); } numCutPoints++; } // Use worse encoding? if (!m_UseBetterEncoding) { numCutPoints = (lastPlusOne - first) - 1; } // Checks if gain is zero gain = priorEntropy - bestEntropy; if (gain <= 0) { return null; } // Check if split is to be accepted if (FayyadAndIranisMDL(priorCounts, bestCounts, numInstances, numCutPoints)) { // Select split points for the left and right subsets left = cutPointsForSubset(attIndex, first, bestIndex + 1); right = cutPointsForSubset(attIndex, bestIndex + 1, lastPlusOne); // Merge cut points and return them if ((left == null) && (right) == null) { cutPoints = new double[1]; cutPoints[0] = bestCutPoint; } else if (right == null) { cutPoints = new double[left.length + 1]; System.arraycopy(left, 0, cutPoints, 0, left.length); cutPoints[left.length] = bestCutPoint; } else if (left == null) { cutPoints = new double[1 + right.length]; cutPoints[0] = bestCutPoint; System.arraycopy(right, 0, cutPoints, 1, right.length); } else { cutPoints = new double[left.length + right.length + 1]; System.arraycopy(left, 0, cutPoints, 0, left.length); cutPoints[left.length] = bestCutPoint; System.arraycopy(right, 0, cutPoints, left.length + 1, right.length); } return cutPoints; } else { return null; } }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Computes the gain of a split: the splitting criterion before the split
 * minus the entropy conditioned on the rows of the post-split distributions.
 *
 * @param dist
 *            the distributions
 * @param priorVal
 *            the splitting criterion
 * @return the gain after the split
 */
protected double gain(double[][] dist, double priorVal) {
    double entropyAfterSplit = ContingencyTables.entropyConditionedOnRows(dist);
    return priorVal - entropyAfterSplit;
}