Example usage for weka.core Instances instance

Introduction

In this page you can find the example usage for weka.core Instances instance.

Prototype



publicInstance instance(int index)

Source Link

Document

Returns the instance at the given position.

Usage

From source file:j48.BinC45Split.java

License:Open Source License

/**
 * Sets distribution associated with model.
 *//*from  w w  w  . ja  va2  s.  c o  m*/
public void resetDistribution(Instances data) throws Exception {

    Instances insts = new Instances(data, data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
        if (whichSubset(data.instance(i)) > -1) {
            insts.add(data.instance(i));
        }
    }
    Distribution newD = new Distribution(insts, this);
    newD.addInstWithUnknown(data, m_attIndex);
    m_distribution = newD;
}

From source file:j48.C45PruneableClassifierTreeG.java

License:Open Source License

/**
 * sorts/deletes instances into/from node and atbop according to 
 * the test for subset, then calls traverseTree for subset's node.
 *
 * @param fulldata all instances//  w ww.j  ava2s . c om
 * @param iindex array the tracks the weight of each instance in
 *        the atbop and at the leaf (0.0 if not present)
 * @param limits array specifying current upper/lower limits for numeric atts
 * @param subset the subset for which to sort instances into inode & iatbop
 */
private void sortInstances(Instances fulldata, double[][] iindex, double[][] limits, int subset)
        throws Exception {

    C45Split test = (C45Split) localModel();

    // update the instances index for subset
    double knownCases = 0;
    double thisSubsetCount = 0;
    for (int x = 0; x < iindex[0].length; x++) {
        if (iindex[0][x] == 0 && iindex[1][x] == 0) // skip "discarded" instances
            continue;
        if (!fulldata.instance(x).isMissing(test.attIndex())) {
            knownCases += iindex[0][x];
            if (test.whichSubset(fulldata.instance(x)) != subset) {
                if (iindex[0][x] > 0) {
                    // move to atbop, delete from leaf
                    iindex[1][x] = iindex[0][x];
                    iindex[0][x] = 0;
                } else {
                    if (iindex[1][x] > 0) {
                        // instance is now "discarded"
                        iindex[1][x] = 0;
                    }
                }
            } else {
                thisSubsetCount += iindex[0][x];
            }
        }
    }

    // work out proportions of weight for missing values for leaf and atbop
    double lprop = (knownCases == 0) ? (1.0 / (double) test.numSubsets())
            : (thisSubsetCount / (double) knownCases);

    // add in the instances that have missing value for attIndex 
    for (int x = 0; x < iindex[0].length; x++) {
        if (iindex[0][x] == 0 && iindex[1][x] == 0)
            continue; // skip "discarded" instances
        if (fulldata.instance(x).isMissing(test.attIndex())) {
            iindex[1][x] -= (iindex[1][x] - iindex[0][x]) * (1 - lprop);
            iindex[0][x] *= lprop;
        }
    }

    int nodeClass = localModel().distribution().maxClass(subset);
    double pL = (localModel().distribution().perClass(nodeClass) + 1.0)
            / (localModel().distribution().total() + 2.0);

    // call traerseTree method for the child node
    son(subset).traverseTree(fulldata, iindex, test.minsAndMaxs(fulldata, limits, subset), this, pL, nodeClass);
}

From source file:j48.C45PruneableClassifierTreeG.java

License:Open Source License

/**
 * finds new nodes that improve accuracy and grafts them onto the tree
 *
 * @param fulldata the instances in whole trainset
 * @param iindex records num tests each instance has failed up to this node
 * @param limits the upper/lower limits for numeric attributes
 * @param parent the node immediately before the current one
 * @param pLaplace laplace for leaf, calculated by parent (in case leaf empty)
 * @param pLeafClass class of leaf, determined by parent (in case leaf empty)
 *///from  ww  w. j a v  a2 s. c  o m
private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent,
        double pLaplace, int pLeafClass) throws Exception {

    // get the class for this leaf
    int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass();

    // get the laplace value for this leaf
    double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass);

    // sort the instances into those at the leaf, those in atbop, and discarded
    Instances l = new Instances(fulldata, fulldata.numInstances());
    Instances n = new Instances(fulldata, fulldata.numInstances());
    int lcount = 0;
    int acount = 0;
    for (int x = 0; x < fulldata.numInstances(); x++) {
        if (iindex[0][x] <= 0 && iindex[1][x] <= 0)
            continue;
        if (iindex[0][x] != 0) {
            l.add(fulldata.instance(x));
            l.instance(lcount).setWeight(iindex[0][x]);
            // move instance's weight in iindex to same index as in l
            iindex[0][lcount++] = iindex[0][x];
        }
        if (iindex[1][x] > 0) {
            n.add(fulldata.instance(x));
            n.instance(acount).setWeight(iindex[1][x]);
            // move instance's weight in iindex to same index as in n
            iindex[1][acount++] = iindex[1][x];
        }
    }

    boolean graftPossible = false;
    double[] classDist = new double[n.numClasses()];
    for (int x = 0; x < n.numInstances(); x++) {
        if (iindex[1][x] > 0 && !n.instance(x).classIsMissing())
            classDist[(int) n.instance(x).classValue()] += iindex[1][x];
    }

    for (int cVal = 0; cVal < n.numClasses(); cVal++) {
        double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0);
        if (cVal != leafClass && (theLaplace > leafLaplace)
                && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) {
            graftPossible = true;
            break;
        }
    }

    if (!graftPossible) {
        return;
    }

    // 1. Initialize to {} a set of tuples t containing potential tests
    ArrayList t = new ArrayList();

    // go through each attribute
    for (int a = 0; a < n.numAttributes(); a++) {
        if (a == n.classIndex())
            continue; // skip the class

        // sort instances in atbop by $a
        int[] sorted = sortByAttribute(n, a);

        // 2. For each continuous attribute $a:
        if (n.attribute(a).isNumeric()) {

            // find min and max values for this attribute at the leaf
            boolean prohibited = false;
            double minLeaf = Double.POSITIVE_INFINITY;
            double maxLeaf = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < l.numInstances(); i++) {
                if (l.instance(i).isMissing(a)) {
                    if (l.instance(i).classValue() == leafClass) {
                        prohibited = true;
                        break;
                    }
                }
                double value = l.instance(i).value(a);
                if (!m_relabel || l.instance(i).classValue() == leafClass) {
                    if (value < minLeaf)
                        minLeaf = value;
                    if (value > maxLeaf)
                        maxLeaf = value;
                }
            }
            if (prohibited) {
                continue;
            }

            // (a) find values of
            //    $n: instances in atbop (already have that, actually)
            //    $v: a value for $a that exists for a case in the atbop, where
            //       $v is < the min value for $a for a case at the leaf which
            //       has the class $c, and $v is > the lowerlimit of $a at
            //       the leaf.
            //       (note: error in original paper stated that $v must be
            //       smaller OR EQUAL TO the min value).
            //    $k: $k is a class
            //  that maximize L' = Laplace({$x: $x contained in cases($n)
            //    & value($a,$x) <= $v & value($a,$x) > lowerlim($l,$a)}, $k).
            double minBestClass = Double.NaN;
            double minBestLaplace = leafLaplace;
            double minBestVal = Double.NaN;
            double minBestPos = Double.NaN;
            double minBestTotal = Double.NaN;
            double[][] minBestCounts = null;
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    break; // missing are sorted to end: no more valid vals

                double theval = n.instance(sorted[x]).value(a);
                if (m_Debug)
                    System.out.println("\t " + theval);

                if (theval <= limits[a][0]) {
                    if (m_Debug)
                        System.out.println("\t  <= lowerlim: continuing...");
                    continue;
                }
                // note: error in paper would have this read "theVal > minLeaf)
                if (theval >= minLeaf) {
                    if (m_Debug)
                        System.out.println("\t  >= minLeaf; breaking...");
                    break;
                }
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                // work out the best laplace/class (for <= theval)
                double total = Utils.sum(counts[0]);
                for (int c = 0; c < n.numClasses(); c++) {
                    double temp = (counts[0][c] + 1.0) / (total + 2.0);
                    if (temp > minBestLaplace) {
                        minBestPos = counts[0][c];
                        minBestTotal = total;
                        minBestLaplace = temp;
                        minBestClass = c;
                        minBestCounts = copyCounts(counts);

                        minBestVal = (x == n.numInstances() - 1) ? theval
                                : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0);
                    }
                }
            }

            // (b) add to t tuple <n,a,v,k,L',"<=">
            if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) {
                GraftSplit gsplit = null;
                try {
                    gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts);
                } catch (Exception e) {
                    System.err.println("graftsplit error: " + e.getMessage());
                    System.exit(1);
                }
                t.add(gsplit);
            }
            // free space
            minBestCounts = null;

            // (c) find values of
            //    n: instances in atbop (already have that, actually)
            //    $v: a value for $a that exists for a case in the atbop, where
            //       $v is > the max value for $a for a case at the leaf which
            //       has the class $c, and $v is <= the upperlimit of $a at
            //       the leaf.
            //    k: k is a class
            //   that maximize L' = Laplace({x: x contained in cases(n)
            //       & value(a,x) > v & value(a,x) <= upperlim(l,a)}, k).
            double maxBestClass = -1;
            double maxBestLaplace = leafLaplace;
            double maxBestVal = Double.NaN;
            double maxBestPos = Double.NaN;
            double maxBestTotal = Double.NaN;
            double[][] maxBestCounts = null;
            for (int c = 0; c < n.numClasses(); c++) { // zero the counts
                counts[0][c] = 0;
                counts[1][c] = 0; // shouldn't need to do this ...
            }

            // check smallest val for a in atbop is < upper limit
            if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) {
                for (int x = n.numInstances() - 1; x >= 0; x--) {
                    if (n.instance(sorted[x]).isMissing(a))
                        continue;

                    double theval = n.instance(sorted[x]).value(a);
                    if (m_Debug)
                        System.out.println("\t " + theval);

                    if (theval > limits[a][1]) {
                        if (m_Debug)
                            System.out.println("\t  >= upperlim; continuing...");
                        continue;
                    }
                    if (theval <= maxLeaf) {
                        if (m_Debug)
                            System.out.println("\t  < maxLeaf; breaking...");
                        break;
                    }

                    // increment counts
                    counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                    if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) {
                        int z = x - 1;
                        while (z >= 0 && n.instance(sorted[z]).value(a) == theval) {
                            z--;
                            x--;
                            counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                        }
                    }

                    // work out best laplace for > theval
                    double total = Utils.sum(counts[1]);
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[1][c] + 1.0) / (total + 2.0);
                        if (temp > maxBestLaplace) {
                            maxBestPos = counts[1][c];
                            maxBestTotal = total;
                            maxBestLaplace = temp;
                            maxBestClass = c;
                            maxBestCounts = copyCounts(counts);
                            maxBestVal = (x == 0) ? theval
                                    : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0);
                        }
                    }
                }

                // (d) add to t tuple <n,a,v,k,L',">">
                if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) {
                    GraftSplit gsplit = null;
                    try {
                        gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts);
                    } catch (Exception e) {
                        System.err.println("graftsplit error:" + e.getMessage());
                        System.exit(1);
                    }
                    t.add(gsplit);
                }
            }
        } else { // must be a nominal attribute

            // 3. for each discrete attribute a for which there is no
            //    test at an ancestor of l

            // skip if this attribute has already been used
            if (limits[a][1] == 1) {
                continue;
            }

            boolean[] prohibit = new boolean[l.attribute(a).numValues()];
            for (int aval = 0; aval < n.attribute(a).numValues(); aval++) {
                for (int x = 0; x < l.numInstances(); x++) {
                    if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval)
                            && (!m_relabel || (l.instance(x).classValue() == leafClass))) {
                        prohibit[aval] = true;
                        break;
                    }
                }
            }

            // (a) find values of
            //       $n: instances in atbop (already have that, actually)
            //       $v: $v is a value for $a
            //       $k: $k is a class
            //     that maximize L' = Laplace({$x: $x contained in cases($n)
            //           & value($a,$x) = $v}, $k).
            double bestVal = Double.NaN;
            double bestClass = Double.NaN;
            double bestLaplace = leafLaplace;
            double[][] bestCounts = null;
            double[][] counts = new double[2][n.numClasses()];

            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    continue;

                // zero the counts
                for (int c = 0; c < n.numClasses(); c++)
                    counts[0][c] = 0;

                double theval = n.instance(sorted[x]).value(a);
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                if (!prohibit[(int) theval]) {
                    // work out best laplace for > theval
                    double total = Utils.sum(counts[0]);
                    bestLaplace = leafLaplace;
                    bestClass = Double.NaN;
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[0][c] + 1.0) / (total + 2.0);
                        if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) {
                            bestLaplace = temp;
                            bestClass = c;
                            bestVal = theval;
                            bestCounts = copyCounts(counts);
                        }
                    }
                    // add to graft list
                    if (!Double.isNaN(bestClass)) {
                        GraftSplit gsplit = null;
                        try {
                            gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts);
                        } catch (Exception e) {
                            System.err.println("graftsplit error: " + e.getMessage());
                            System.exit(1);
                        }
                        t.add(gsplit);
                    }
                }
            }
            // (b) add to t tuple <n,a,v,k,L',"=">
            // done this already
        }
    }

    // 4. remove from t all tuples <n,a,v,c,L,x> such that L <=
    //    Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c) <= 0.05
    //      -- checked this constraint prior to adding a tuple --

    // *** step six done before step five for efficiency ***
    // 6. for each <n,a,v,k,L,x> in t ordered on L from highest to lowest
    // order the tuples from highest to lowest laplace
    // (this actually orders lowest to highest)
    Collections.sort(t);

    // 5. remove from t all tuples <n,a,v,c,L,x> such that there is
    //    no tuple <n',a',v',k',L',x'> such that k' != c & L' < L.
    for (int x = 0; x < t.size(); x++) {
        GraftSplit gs = (GraftSplit) t.get(x);
        if (gs.maxClassForSubsetOfInterest() != leafClass) {
            break; // reached a graft with class != leafClass, so stop deleting
        } else {
            t.remove(x);
            x--;
        }
    }

    // if no potential grafts were found, do nothing and return
    if (t.size() < 1) {
        return;
    }

    // create the distributions for each graft
    for (int x = t.size() - 1; x >= 0; x--) {
        GraftSplit gs = (GraftSplit) t.get(x);
        try {
            gs.buildClassifier(l);
            gs.deleteGraftedCases(l); // so they don't go down the other branch
        } catch (Exception e) {
            System.err.println("graftsplit build error: " + e.getMessage());
        }
    }

    // add this stuff to the tree
    ((C45PruneableClassifierTreeG) parent).setDescendents(t, this);
}

From source file:j48.C45Split.java

License:Open Source License

/**
 * Creates split on numeric attribute./*from  w  w w  .ja v a 2s . c o  m*/
 * 
 * @exception Exception
 *                if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next
            // possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute
            // values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = infoGainCrit.splitCritValue1(m_distribution, m_sumOfWeights, defaultEnt,
                        rrrrr);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                m_index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (m_index == 0)
        return;

    // Compute modified information gain for best split.
    m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distributioN for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = gainRatioCrit.splitCritValue1(m_distribution, m_sumOfWeights, m_infoGain, lllll);
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Adds all instances in given range to given bag.
 *
 * @exception Exception if something goes wrong
 *//*from  w  ww  .j ava  2  s .c o  m*/
public final void addRange(int bagIndex, Instances source, int startIndex, int lastPlusOne) throws Exception {

    double sumOfWeights = 0;
    int classIndex;
    Instance instance;
    int i;

    for (i = startIndex; i < lastPlusOne; i++) {
        instance = (Instance) source.instance(i);
        classIndex = (int) instance.classValue();
        sumOfWeights = sumOfWeights + instance.weight();
        m_perClassPerBag[bagIndex][classIndex] += instance.weight();
        m_perClass[classIndex] += instance.weight();
    }
    m_perBag[bagIndex] += sumOfWeights;
    totaL += sumOfWeights;
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Deletes all instances in given range from given bag.
 *
 * @exception Exception if something goes wrong
 *///from  www .  j av  a2s  .co m
public final void delRange(int bagIndex, Instances source, int startIndex, int lastPlusOne) throws Exception {

    double sumOfWeights = 0;
    int classIndex;
    Instance instance;
    int i;

    for (i = startIndex; i < lastPlusOne; i++) {
        instance = (Instance) source.instance(i);
        classIndex = (int) instance.classValue();
        sumOfWeights = sumOfWeights + instance.weight();
        m_perClassPerBag[bagIndex][classIndex] -= instance.weight();
        m_perClass[classIndex] -= instance.weight();
    }
    m_perBag[bagIndex] -= sumOfWeights;
    totaL -= sumOfWeights;
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Shifts all instances in given range from one bag to another one.
 *
 * @exception Exception if something goes wrong
 *//*w w w .  j  a  v a  2 s .  c om*/
public final void shiftRange(int from, int to, Instances source, int startIndex, int lastPlusOne)
        throws Exception {

    int classIndex;
    double weight;
    Instance instance;
    int i;

    for (i = startIndex; i < lastPlusOne; i++) {
        instance = (Instance) source.instance(i);
        classIndex = (int) instance.classValue();
        weight = instance.weight();
        m_perClassPerBag[from][classIndex] -= weight;
        m_perClassPerBag[to][classIndex] += weight;
        m_perBag[from] -= weight;
        m_perBag[to] += weight;
    }
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * deletes the cases in data that belong to leaf pointed to by
 * the test (i.e. the subset of interest).  this is useful so
 * the instances belonging to that leaf aren't passed down the
 * other branch./*from   ww w.  j  a va  2s. c om*/
 *
 * @param data the instances to delete from
 */
public void deleteGraftedCases(Instances data) {

    int subOfInterest = subsetOfInterest();
    for (int x = 0; x < data.numInstances(); x++) {
        if (whichSubset(data.instance(x)) == subOfInterest) {
            data.delete(x--);
        }
    }
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * builds m_graftdistro using the passed data
 *
 * @param data the instances to use when creating the distribution
 *//*from   w  w w .  j av  a  2  s .  co m*/
public void buildClassifier(Instances data) throws Exception {

    // distribution for the graft, not counting cases in atbop, only orig leaf
    m_graftdistro = new Distribution(2, data.numClasses());

    // which subset are we looking at for the graft?
    int subset = subsetOfInterest(); // this is the subset for m_leaf

    double thisNodeCount = 0;
    double knownCases = 0;
    boolean allKnown = true;
    // populate distribution
    for (int x = 0; x < data.numInstances(); x++) {
        Instance instance = data.instance(x);
        if (instance.isMissing(m_attIndex)) {
            allKnown = false;
            continue;
        }
        knownCases += instance.weight();
        int subst = whichSubset(instance);
        if (subst == -1)
            continue;
        m_graftdistro.add(subst, instance);
        if (subst == subset) { // instance belongs at m_leaf
            thisNodeCount += instance.weight();
        }
    }
    double factor = (knownCases == 0) ? (1.0 / (double) 2.0) : (thisNodeCount / knownCases);
    if (!allKnown) {
        for (int x = 0; x < data.numInstances(); x++) {
            if (data.instance(x).isMissing(m_attIndex)) {
                Instance instance = data.instance(x);
                int subst = whichSubset(instance);
                if (subst == -1)
                    continue;
                instance.setWeight(instance.weight() * factor);
                m_graftdistro.add(subst, instance);
            }
        }
    }

    // if there are no cases at the leaf, make sure the desired
    // class is chosen, by setting counts to 0.01
    if (m_graftdistro.perBag(subset) == 0) {
        double[] counts = new double[data.numClasses()];
        counts[m_maxClass] = 0.01;
        m_graftdistro.add(subset, counts);
    }
    if (m_graftdistro.perBag((subset == 0) ? 1 : 0) == 0) {
        double[] counts = new double[data.numClasses()];
        counts[(int) m_otherLeafMaxClass] = 0.01;
        m_graftdistro.add((subset == 0) ? 1 : 0, counts);
    }
}

From source file:j48.NBTreeNoSplit.java

License:Open Source License

/**
 * Utility method for fast 5-fold cross validation of a naive bayes
 * model//from w  ww . j  a v a 2  s. c o  m
 *
 * @param fullModel a <code>NaiveBayesUpdateable</code> value
 * @param trainingSet an <code>Instances</code> value
 * @param r a <code>Random</code> value
 * @return a <code>double</code> value
 * @exception Exception if an error occurs
 */
public static double crossValidate(NaiveBayesUpdateable fullModel, Instances trainingSet, Random r)
        throws Exception {
    // make some copies for fast evaluation of 5-fold xval
    Classifier[] copies = Classifier.makeCopies(fullModel, 5);
    Evaluation eval = new Evaluation(trainingSet);
    // make some splits
    for (int j = 0; j < 5; j++) {
        Instances test = trainingSet.testCV(5, j);
        // unlearn these test instances
        for (int k = 0; k < test.numInstances(); k++) {
            test.instance(k).setWeight(-test.instance(k).weight());
            ((NaiveBayesUpdateable) copies[j]).updateClassifier(test.instance(k));
            // reset the weight back to its original value
            test.instance(k).setWeight(-test.instance(k).weight());
        }
        eval.evaluateModel(copies[j], test);
    }
    return eval.incorrect();
}