Example usage for weka.core ContingencyTables entropyConditionedOnRows

List of usage examples for weka.core ContingencyTables entropyConditionedOnRows

Introduction

In this page you can find the example usage for weka.core ContingencyTables entropyConditionedOnRows.

Prototype

public static double entropyConditionedOnRows(double[][] matrix) 

Source Link

Document

Computes conditional entropy of the columns given the rows.

Usage

From source file: boosting.classifiers.DecisionStumpWritable.java

License: Open Source License

/**
 * Finds best split for nominal attribute and nominal class
 * and returns value./*from ww  w.  j  a v a2s  .c  om*/
 *
 * @param index attribute index
 * @return value of criterion for the best split
 * @throws Exception if something goes wrong
 */
private double findSplitNominalNominal(int index) throws Exception {

    double bestVal = Double.MAX_VALUE, currVal;
    double[][] counts = new double[m_Instances.attribute(index).numValues() + 1][m_Instances.numClasses()];
    double[] sumCounts = new double[m_Instances.numClasses()];
    double[][] bestDist = new double[3][m_Instances.numClasses()];
    int numMissing = 0;

    // Compute counts for all the values
    for (int i = 0; i < m_Instances.numInstances(); i++) {
        Instance inst = m_Instances.instance(i);
        if (inst.isMissing(index)) {
            numMissing++;
            counts[m_Instances.attribute(index).numValues()][(int) inst.classValue()] += inst.weight();
        } else {
            counts[(int) inst.value(index)][(int) inst.classValue()] += inst.weight();
        }
    }

    // Compute sum of counts
    for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) {
        for (int j = 0; j < m_Instances.numClasses(); j++) {
            sumCounts[j] += counts[i][j];
        }
    }

    // Make split counts for each possible split and evaluate
    System.arraycopy(counts[m_Instances.attribute(index).numValues()], 0, m_Distribution[2], 0,
            m_Instances.numClasses());
    for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) {
        for (int j = 0; j < m_Instances.numClasses(); j++) {
            m_Distribution[0][j] = counts[i][j];
            m_Distribution[1][j] = sumCounts[j] - counts[i][j];
        }
        currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution);
        if (currVal < bestVal) {
            bestVal = currVal;
            m_SplitPoint = (double) i;
            for (int j = 0; j < 3; j++) {
                System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses());
            }
        }
    }

    // No missing values in training data.
    if (numMissing == 0) {
        System.arraycopy(sumCounts, 0, bestDist[2], 0, m_Instances.numClasses());
    }

    m_Distribution = bestDist;
    return bestVal;
}

From source file: boosting.classifiers.DecisionStumpWritable.java

License: Open Source License

/**
 * Finds best split for numeric attribute and nominal class
 * and returns value./*from   www .  java 2s .  co m*/
 *
 * @param index attribute index
 * @return value of criterion for the best split
 * @throws Exception if something goes wrong
 */
private double findSplitNumericNominal(int index) throws Exception {

    double bestVal = Double.MAX_VALUE, currVal, currCutPoint;
    int numMissing = 0;
    double[] sum = new double[m_Instances.numClasses()];
    double[][] bestDist = new double[3][m_Instances.numClasses()];

    // Compute counts for all the values
    for (int i = 0; i < m_Instances.numInstances(); i++) {
        Instance inst = m_Instances.instance(i);
        if (!inst.isMissing(index)) {
            m_Distribution[1][(int) inst.classValue()] += inst.weight();
        } else {
            m_Distribution[2][(int) inst.classValue()] += inst.weight();
            numMissing++;
        }
    }
    System.arraycopy(m_Distribution[1], 0, sum, 0, m_Instances.numClasses());

    // Save current distribution as best distribution
    for (int j = 0; j < 3; j++) {
        System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses());
    }

    // Sort instances
    m_Instances.sort(index);

    // Make split counts for each possible split and evaluate
    for (int i = 0; i < m_Instances.numInstances() - (numMissing + 1); i++) {
        Instance inst = m_Instances.instance(i);
        Instance instPlusOne = m_Instances.instance(i + 1);
        m_Distribution[0][(int) inst.classValue()] += inst.weight();
        m_Distribution[1][(int) inst.classValue()] -= inst.weight();
        if (inst.value(index) < instPlusOne.value(index)) {
            currCutPoint = (inst.value(index) + instPlusOne.value(index)) / 2.0;
            currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution);
            if (currVal < bestVal) {
                m_SplitPoint = currCutPoint;
                bestVal = currVal;
                for (int j = 0; j < 3; j++) {
                    System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses());
                }
            }
        }
    }

    // No missing values in training data.
    if (numMissing == 0) {
        System.arraycopy(sum, 0, bestDist[2], 0, m_Instances.numClasses());
    }

    m_Distribution = bestDist;
    return bestVal;
}

From source file: feature.InfoGainEval.java

License: Open Source License

/**
 * Computes the information gain of a single attribute with respect to the
 * class. Numeric attributes are discretized first (or converted to binary
 * when m_Binarize is set); missing-value mass is redistributed over the
 * contingency table when m_missing_merge is set.
 *
 * @param data
 *            set of instances serving as training data
 * @param att
 *            index of the attribute to evaluate
 * @return the information gain of attribute att
 * @throws Exception
 *             if the evaluator has not been generated successfully
 */
public double computeInfoGain(Instances data, int att) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Preprocess numeric attributes: supervised discretization by default,
    // or a numeric-to-binary conversion when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve one contingency table per attribute. Each table has an extra
    // row (index numValues) for missing attribute values and an extra
    // column (index numClasses) for missing class values.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Compute the overall class distribution once ...
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    // ... and seed row 0 of every table with it. The sparse pass below then
    // moves weight from row 0 to the row of each explicitly stored value.
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts: only explicitly stored (sparse) values are visited, so
    // each instance's weight is transferred out of row 0 into the cell
    // matching its attribute value and class (using the extra row/column
    // when either one is missing).
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // value present, class missing
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // value missing, class present
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // both value and class missing
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // Distribute the missing-value mass over the table, if required.
    if (m_missing_merge) {

        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Marginals over the known-value part of the table.
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Missing-class mass is spread over each row in
                    // proportion to the row totals.
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Missing-attribute-value mass is spread over each
                    // column in proportion to the column totals.
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Mass missing both value and class is spread over all
                    // cells in proportion to the cell counts.
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Replace the table, dropping the missing row/column.
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Information gain = H(class) - H(class | attribute).
    // NOTE(review): m_InfoGains is reallocated on every call, so only the
    // entry for att is meaningful afterwards.
    m_InfoGains = new double[data.numAttributes()];
    m_InfoGains[att] = (ContingencyTables.entropyOverColumns(counts[att])
            - ContingencyTables.entropyConditionedOnRows(counts[att]));

    return m_InfoGains[att];
}

From source file: feature.InfoGainEval.java

License: Open Source License

/**
 * Initializes the information gain evaluator: discretizes (or binarizes)
 * numeric attributes, builds one contingency table per attribute with
 * extra slots for missing values, optionally redistributes the
 * missing-value mass, and stores the information gain of every attribute
 * in m_InfoGains.
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Preprocess numeric attributes: supervised discretization by default,
    // or a numeric-to-binary conversion when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve one contingency table per attribute. Each table has an extra
    // row (index numValues) for missing attribute values and an extra
    // column (index numClasses) for missing class values.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Compute the overall class distribution once ...
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    // ... and seed row 0 of every table with it. The sparse pass below then
    // moves weight from row 0 to the row of each explicitly stored value.
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts: only explicitly stored (sparse) values are visited, so
    // each instance's weight is transferred out of row 0 into the cell
    // matching its attribute value and class (using the extra row/column
    // when either one is missing).
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // value present, class missing
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // value missing, class present
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // both value and class missing
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // Distribute the missing-value mass over the table, if required.
    if (m_missing_merge) {

        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Marginals over the known-value part of the table.
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Missing-class mass is spread over each row in
                    // proportion to the row totals.
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Missing-attribute-value mass is spread over each
                    // column in proportion to the column totals.
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Mass missing both value and class is spread over all
                    // cells in proportion to the cell counts.
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Replace the table, dropping the missing row/column.
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Information gain per attribute = H(class) - H(class | attribute).
    m_InfoGains = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_InfoGains[i] = (ContingencyTables.entropyOverColumns(counts[i])
                    - ContingencyTables.entropyConditionedOnRows(counts[i]));
        }
    }
}

From source file: moa.reduction.bayes.IncrInfoThAttributeEval.java

License: Open Source License

@Override
/**/*  w  ww .  j  ava2s  . com*/
 * Update the contingency tables and the rankings for each features using the counters.
 * Counters are updated in each iteration.
 */
public void applySelection() {
    if (counts != null && updated) {
        m_InfoValues = new double[counts.length];
        for (int i = 0; i < counts.length; i++) {
            if (i != classIndex) {
                Set<Key> keys = counts[i].keySet();
                Set<Entry<Key, Float>> entries = counts[i].entrySet();

                Set<Float> avalues = new HashSet<Float>();
                Set<Float> cvalues = new HashSet<Float>();
                for (Iterator<Key> it = keys.iterator(); it.hasNext();) {
                    Key key = it.next();
                    avalues.add(key.x);
                    cvalues.add(key.y);
                }

                Map<Float, Integer> apos = new HashMap<Float, Integer>();
                Map<Float, Integer> cpos = new HashMap<Float, Integer>();

                int aidx = 0;
                for (Iterator<Float> it = avalues.iterator(); it.hasNext();) {
                    Float f = it.next();
                    apos.put(f, aidx++);
                }

                int cidx = 0;
                for (Iterator<Float> it = cvalues.iterator(); it.hasNext();) {
                    Float f = it.next();
                    cpos.put(f, cidx++);
                }

                double[][] lcounts = new double[avalues.size()][cvalues.size()];
                for (Iterator<Entry<Key, Float>> it = entries.iterator(); it.hasNext();) {
                    Entry<Key, Float> entry = it.next();
                    lcounts[apos.get(entry.getKey().x)][cpos.get(entry.getKey().y)] = entry.getValue();
                }

                switch (method) {
                case 1:
                    m_InfoValues[i] = ContingencyTables.symmetricalUncertainty(lcounts);
                    break;

                default:
                    m_InfoValues[i] = (ContingencyTables.entropyOverColumns(lcounts)
                            - ContingencyTables.entropyConditionedOnRows(lcounts));
                    break;
                }
            }
        }
        //System.out.println("Attribute values: " + Arrays.toString(m_InfoValues));
        updated = false;
    }
}

From source file: moa.reduction.bayes.PIDdiscretize.java

License: Open Source License

/**
 * Recursively computes entropy-based cut points for a range of intervals
 * of one attribute, accepting splits via Fayyad and Irani's MDL criterion.
 *
 * @param attIndex index of the attribute being discretized
 * @param first index of the first interval in the range (inclusive)
 * @param lastPlusOne index one past the last interval in the range
 * @return the selected cut points, or null if no acceptable split exists
 */
private double[] cutPointsForSubset(int attIndex, int first, int lastPlusOne) {

    double[] left, right, cutPoints;
    double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1, currentEntropy, bestEntropy, priorEntropy,
            gain;
    int bestIndex = -1, numCutPoints = 0;
    double numInstances = 0;

    // A range with fewer than two intervals cannot be split.
    if ((lastPlusOne - first) < 2) {
        return null;
    }

    // Determine the largest class label observed in this range.
    int numClasses = 0;
    for (int i = first; i < lastPlusOne; i++) {
        Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i);
        for (Integer key : classDist.keySet()) {
            if (key > numClasses) {
                numClasses = key;
            }
        }
    }
    numClasses += 1;

    // Aggregate class counts; initially everything is on the right (row 1).
    double[][] counts = new double[2][numClasses];
    for (int i = first; i < lastPlusOne; i++) {
        Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i);
        for (Map.Entry<Integer, Float> entry : classDist.entrySet()) {
            counts[1][entry.getKey()] += entry.getValue();
            numInstances += entry.getValue();
        }
    }

    // Save prior counts
    double[] priorCounts = new double[numClasses];
    System.arraycopy(counts[1], 0, priorCounts, 0, numClasses);

    // Entropy of the full (unsplit) range. The original computed this
    // twice in a row; once is enough.
    priorEntropy = ContingencyTables.entropy(priorCounts);
    bestEntropy = priorEntropy;

    // Sweep every candidate boundary, moving one interval at a time from
    // right (row 1) to left (row 0), and keep the split with the lowest
    // conditional entropy.
    double[][] bestCounts = new double[2][numClasses];
    for (int i = first; i < (lastPlusOne - 1); i++) {
        Map<Integer, Float> classDist = m_Distrib.get(attIndex).get(i);
        for (Map.Entry<Integer, Float> entry : classDist.entrySet()) {
            counts[0][entry.getKey()] += entry.getValue();
            counts[1][entry.getKey()] -= entry.getValue();
        }
        currentCutPoint = m_CutPointsL1.get(attIndex).get(i);
        currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
        if (currentEntropy < bestEntropy) {
            bestCutPoint = currentCutPoint;
            bestEntropy = currentEntropy;
            bestIndex = i;
            System.arraycopy(counts[0], 0, bestCounts[0], 0, numClasses);
            System.arraycopy(counts[1], 0, bestCounts[1], 0, numClasses);
        }
        numCutPoints++;
    }

    // Use worse encoding?
    if (!m_UseBetterEncoding) {
        numCutPoints = (lastPlusOne - first) - 1;
    }

    // No split is worthwhile when the entropy gain is not positive.
    gain = priorEntropy - bestEntropy;
    if (gain <= 0) {
        return null;
    }

    // Accept the split only if it passes the MDL criterion.
    if (FayyadAndIranisMDL(priorCounts, bestCounts, numInstances, numCutPoints)) {

        // Recurse on the two halves around the chosen boundary.
        left = cutPointsForSubset(attIndex, first, bestIndex + 1);
        right = cutPointsForSubset(attIndex, bestIndex + 1, lastPlusOne);

        // Merge left cut points, the best cut point, and right cut points.
        if ((left == null) && (right == null)) {
            cutPoints = new double[1];
            cutPoints[0] = bestCutPoint;
        } else if (right == null) {
            cutPoints = new double[left.length + 1];
            System.arraycopy(left, 0, cutPoints, 0, left.length);
            cutPoints[left.length] = bestCutPoint;
        } else if (left == null) {
            cutPoints = new double[1 + right.length];
            cutPoints[0] = bestCutPoint;
            System.arraycopy(right, 0, cutPoints, 1, right.length);
        } else {
            cutPoints = new double[left.length + right.length + 1];
            System.arraycopy(left, 0, cutPoints, 0, left.length);
            cutPoints[left.length] = bestCutPoint;
            System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
        }

        return cutPoints;
    } else {
        return null;
    }
}

From source file: org.scripps.branch.classifier.ManualTree.java

License: Open Source License

/**
 * Computes the value of the splitting criterion after the split: the prior
 * criterion value minus the conditional entropy of the split distributions.
 *
 * @param dist
 *            the distributions
 * @param priorVal
 *            the splitting criterion before the split
 * @return the gain after the split
 */
protected double gain(double[][] dist, double priorVal) {
    double postSplitEntropy = ContingencyTables.entropyConditionedOnRows(dist);
    return priorVal - postSplitEntropy;
}