List of usage examples for weka.core Instances instance
public Instance instance(int index)
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Rebuilds the distribution held by this split model from the given data.
 *
 * <p>Every instance for which {@code whichSubset} reports a subset (a value
 * greater than -1) is collected and used to build a fresh distribution; the
 * complete data set is then handed to {@code addInstWithUnknown} for this
 * split's attribute before the result replaces {@code m_distribution}.
 *
 * @param data the instances to compute the new distribution from
 * @throws Exception if the distribution cannot be built
 */
public void resetDistribution(Instances data) throws Exception {

    Instances assigned = new Instances(data, data.numInstances());

    int position = 0;
    while (position < data.numInstances()) {
        boolean hasSubset = whichSubset(data.instance(position)) > -1;
        if (hasSubset) {
            assigned.add(data.instance(position));
        }
        position++;
    }

    Distribution rebuilt = new Distribution(assigned, this);
    rebuilt.addInstWithUnknown(data, m_attIndex);
    m_distribution = rebuilt;
}
From source file:j48.C45PruneableClassifierTreeG.java
License:Open Source License
/** * sorts/deletes instances into/from node and atbop according to * the test for subset, then calls traverseTree for subset's node. * * @param fulldata all instances// w ww.j ava2s . c om * @param iindex array the tracks the weight of each instance in * the atbop and at the leaf (0.0 if not present) * @param limits array specifying current upper/lower limits for numeric atts * @param subset the subset for which to sort instances into inode & iatbop */ private void sortInstances(Instances fulldata, double[][] iindex, double[][] limits, int subset) throws Exception { C45Split test = (C45Split) localModel(); // update the instances index for subset double knownCases = 0; double thisSubsetCount = 0; for (int x = 0; x < iindex[0].length; x++) { if (iindex[0][x] == 0 && iindex[1][x] == 0) // skip "discarded" instances continue; if (!fulldata.instance(x).isMissing(test.attIndex())) { knownCases += iindex[0][x]; if (test.whichSubset(fulldata.instance(x)) != subset) { if (iindex[0][x] > 0) { // move to atbop, delete from leaf iindex[1][x] = iindex[0][x]; iindex[0][x] = 0; } else { if (iindex[1][x] > 0) { // instance is now "discarded" iindex[1][x] = 0; } } } else { thisSubsetCount += iindex[0][x]; } } } // work out proportions of weight for missing values for leaf and atbop double lprop = (knownCases == 0) ? 
(1.0 / (double) test.numSubsets()) : (thisSubsetCount / (double) knownCases); // add in the instances that have missing value for attIndex for (int x = 0; x < iindex[0].length; x++) { if (iindex[0][x] == 0 && iindex[1][x] == 0) continue; // skip "discarded" instances if (fulldata.instance(x).isMissing(test.attIndex())) { iindex[1][x] -= (iindex[1][x] - iindex[0][x]) * (1 - lprop); iindex[0][x] *= lprop; } } int nodeClass = localModel().distribution().maxClass(subset); double pL = (localModel().distribution().perClass(nodeClass) + 1.0) / (localModel().distribution().total() + 2.0); // call traerseTree method for the child node son(subset).traverseTree(fulldata, iindex, test.minsAndMaxs(fulldata, limits, subset), this, pL, nodeClass); }
From source file:j48.C45PruneableClassifierTreeG.java
License:Open Source License
/**
 * Finds new nodes that improve accuracy and grafts them onto the tree.
 *
 * <p>Outline of what the code below does:
 * <ol>
 *   <li>Determines the leaf's class and Laplace value; the values supplied by
 *       the parent are used when {@code m_isEmpty} is set.</li>
 *   <li>Splits {@code fulldata} into {@code l} (instances with a non-zero
 *       {@code iindex[0]} weight) and {@code n} (instances with a positive
 *       {@code iindex[1]} weight), copying those weights onto the instances.
 *       Both rows of {@code iindex} are compacted in place so that their
 *       indices line up with {@code l} and {@code n} respectively.</li>
 *   <li>Returns early when no class other than the leaf class could pass the
 *       Laplace and {@code biprob} checks.</li>
 *   <li>For every non-class attribute, collects candidate
 *       {@code GraftSplit}s in {@code t}: numeric attributes may contribute a
 *       lower-side candidate (third constructor argument 0) and an upper-side
 *       candidate (argument 1), nominal attributes one candidate per
 *       non-prohibited value (argument 2).</li>
 *   <li>Sorts {@code t}, drops leading candidates whose
 *       {@code maxClassForSubsetOfInterest()} equals the leaf class, builds
 *       the remaining candidates on {@code l} (last to first) and passes them
 *       to {@code parent.setDescendents}.</li>
 * </ol>
 *
 * <p>Note: a failure while constructing a {@code GraftSplit} is printed to
 * {@code System.err} and terminates the JVM via {@code System.exit(1)};
 * a failure while building a graft is only printed.
 *
 * @param fulldata the instances in whole trainset
 * @param iindex per-instance weights; row 0 is used here as the weight at
 *        the leaf and row 1 as the weight in the atbop (modified in place)
 * @param limits the upper/lower limits for numeric attributes
 *        ({@code limits[a][0]} lower, {@code limits[a][1]} upper); for a
 *        nominal attribute {@code limits[a][1] == 1} marks it as already used
 * @param parent the node immediately before the current one; it is cast to
 *        {@code C45PruneableClassifierTreeG}
 * @param pLaplace laplace for leaf, calculated by parent (in case leaf empty)
 * @param pLeafClass class of leaf, determined by parent (in case leaf empty)
 * @throws Exception if one of the called helpers fails
 */
private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent,
        double pLaplace, int pLeafClass) throws Exception {

    // get the class for this leaf
    int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass();

    // get the laplace value for this leaf
    double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass);

    // sort the instances into those at the leaf, those in atbop, and discarded
    Instances l = new Instances(fulldata, fulldata.numInstances());
    Instances n = new Instances(fulldata, fulldata.numInstances());
    int lcount = 0;
    int acount = 0;
    for (int x = 0; x < fulldata.numInstances(); x++) {
        // neither at the leaf nor in the atbop: discarded instance
        if (iindex[0][x] <= 0 && iindex[1][x] <= 0)
            continue;
        if (iindex[0][x] != 0) {
            l.add(fulldata.instance(x));
            l.instance(lcount).setWeight(iindex[0][x]);
            // move instance's weight in iindex to same index as in l
            iindex[0][lcount++] = iindex[0][x];
        }
        if (iindex[1][x] > 0) {
            n.add(fulldata.instance(x));
            n.instance(acount).setWeight(iindex[1][x]);
            // move instance's weight in iindex to same index as in n
            iindex[1][acount++] = iindex[1][x];
        }
    }

    // Quick feasibility check: per-class weight of the atbop instances.
    boolean graftPossible = false;
    double[] classDist = new double[n.numClasses()];
    for (int x = 0; x < n.numInstances(); x++) {
        if (iindex[1][x] > 0 && !n.instance(x).classIsMissing())
            classDist[(int) n.instance(x).classValue()] += iindex[1][x];
    }

    // A graft is only attempted if some class other than the leaf class
    // would pass both tests when its whole weight is counted as positive.
    for (int cVal = 0; cVal < n.numClasses(); cVal++) {
        double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0);
        if (cVal != leafClass && (theLaplace > leafLaplace)
                && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) {
            graftPossible = true;
            break;
        }
    }

    if (!graftPossible) {
        return;
    }

    // 1. Initialize to {} a set of tuples t containing potential tests
    //    (raw list; its elements are the GraftSplit objects added below)
    ArrayList t = new ArrayList();

    // go through each attribute
    for (int a = 0; a < n.numAttributes(); a++) {
        if (a == n.classIndex())
            continue; // skip the class

        // sort instances in atbop by $a
        int[] sorted = sortByAttribute(n, a);

        // 2. For each continuous attribute $a:
        if (n.attribute(a).isNumeric()) {

            // find min and max values for this attribute at the leaf
            boolean prohibited = false;
            double minLeaf = Double.POSITIVE_INFINITY;
            double maxLeaf = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < l.numInstances(); i++) {
                if (l.instance(i).isMissing(a)) {
                    // a leaf-class instance with a missing value rules
                    // this attribute out entirely
                    if (l.instance(i).classValue() == leafClass) {
                        prohibited = true;
                        break;
                    }
                }
                double value = l.instance(i).value(a);
                // with m_relabel set, only leaf-class instances bound the range
                if (!m_relabel || l.instance(i).classValue() == leafClass) {
                    if (value < minLeaf)
                        minLeaf = value;
                    if (value > maxLeaf)
                        maxLeaf = value;
                }
            }
            if (prohibited) {
                continue;
            }

            // (a) find values of
            //   $n: instances in atbop (already have that, actually)
            //   $v: a value for $a that exists for a case in the atbop, where
            //       $v is < the min value for $a for a case at the leaf which
            //       has the class $c, and $v is > the lowerlimit of $a at
            //       the leaf.
            //       (note: error in original paper stated that $v must be
            //       smaller OR EQUAL TO the min value).
            //   $k: $k is a class
            // that maximize L' = Laplace({$x: $x contained in cases($n)
            //   & value($a,$x) <= $v & value($a,$x) > lowerlim($l,$a)}, $k).
            double minBestClass = Double.NaN; // assigned below, not read again in this method
            double minBestLaplace = leafLaplace;
            double minBestVal = Double.NaN;
            double minBestPos = Double.NaN;
            double minBestTotal = Double.NaN;
            double[][] minBestCounts = null;
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    break; // missing are sorted to end: no more valid vals

                double theval = n.instance(sorted[x]).value(a);
                if (m_Debug)
                    System.out.println("\t " + theval);

                if (theval <= limits[a][0]) {
                    if (m_Debug)
                        System.out.println("\t <= lowerlim: continuing...");
                    continue;
                }
                // note: error in paper would have this read "theVal > minLeaf)
                if (theval >= minLeaf) {
                    if (m_Debug)
                        System.out.println("\t >= minLeaf; breaking...");
                    break;
                }
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // absorb all following instances with the same value; x is
                // advanced together with z so the outer loop skips them
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                // work out the best laplace/class (for <= theval)
                double total = Utils.sum(counts[0]);
                for (int c = 0; c < n.numClasses(); c++) {
                    double temp = (counts[0][c] + 1.0) / (total + 2.0);
                    if (temp > minBestLaplace) {
                        minBestPos = counts[0][c];
                        minBestTotal = total;
                        minBestLaplace = temp;
                        minBestClass = c;
                        minBestCounts = copyCounts(counts);
                        // cut point: midway to the next sorted value, or the
                        // value itself when it is the last one
                        minBestVal = (x == n.numInstances() - 1) ? theval
                                : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0);
                    }
                }
            }

            // (b) add to t tuple <n,a,v,k,L',"<=">
            if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) {
                GraftSplit gsplit = null;
                try {
                    gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts);
                } catch (Exception e) {
                    // NOTE(review): terminates the whole JVM on failure
                    System.err.println("graftsplit error: " + e.getMessage());
                    System.exit(1);
                }
                t.add(gsplit);
            }
            // free space
            minBestCounts = null;

            // (c) find values of
            //   n: instances in atbop (already have that, actually)
            //   $v: a value for $a that exists for a case in the atbop, where
            //       $v is > the max value for $a for a case at the leaf which
            //       has the class $c, and $v is <= the upperlimit of $a at
            //       the leaf.
            //   k: k is a class
            // that maximize L' = Laplace({x: x contained in cases(n)
            //   & value(a,x) > v & value(a,x) <= upperlim(l,a)}, k).
            double maxBestClass = -1; // assigned below, not read again in this method
            double maxBestLaplace = leafLaplace;
            double maxBestVal = Double.NaN;
            double maxBestPos = Double.NaN;
            double maxBestTotal = Double.NaN;
            double[][] maxBestCounts = null;
            for (int c = 0; c < n.numClasses(); c++) { // zero the counts
                counts[0][c] = 0;
                counts[1][c] = 0; // shouldn't need to do this ...
            }

            // check smallest val for a in atbop is < upper limit
            if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) {
                // scan from the largest value downwards
                for (int x = n.numInstances() - 1; x >= 0; x--) {
                    if (n.instance(sorted[x]).isMissing(a))
                        continue;

                    double theval = n.instance(sorted[x]).value(a);
                    if (m_Debug)
                        System.out.println("\t " + theval);

                    if (theval > limits[a][1]) {
                        if (m_Debug)
                            System.out.println("\t >= upperlim; continuing...");
                        continue;
                    }
                    if (theval <= maxLeaf) {
                        if (m_Debug)
                            System.out.println("\t < maxLeaf; breaking...");
                        break;
                    }

                    // increment counts
                    counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                    // absorb all preceding instances with the same value; x is
                    // moved down together with z so the outer loop skips them
                    if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) {
                        int z = x - 1;
                        while (z >= 0 && n.instance(sorted[z]).value(a) == theval) {
                            z--;
                            x--;
                            counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                        }
                    }

                    // work out best laplace for > theval
                    double total = Utils.sum(counts[1]);
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[1][c] + 1.0) / (total + 2.0);
                        if (temp > maxBestLaplace) {
                            maxBestPos = counts[1][c];
                            maxBestTotal = total;
                            maxBestLaplace = temp;
                            maxBestClass = c;
                            maxBestCounts = copyCounts(counts);
                            // cut point: midway to the previous sorted value,
                            // or the value itself when it is the first one
                            maxBestVal = (x == 0) ? theval
                                    : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0);
                        }
                    }
                }

                // (d) add to t tuple <n,a,v,k,L',">">
                if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) {
                    GraftSplit gsplit = null;
                    try {
                        gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts);
                    } catch (Exception e) {
                        // NOTE(review): terminates the whole JVM on failure
                        System.err.println("graftsplit error:" + e.getMessage());
                        System.exit(1);
                    }
                    t.add(gsplit);
                }
            }
        } else { // must be a nominal attribute

            // 3. for each discrete attribute a for which there is no
            //    test at an ancestor of l

            // skip if this attribute has already been used
            if (limits[a][1] == 1) {
                continue;
            }

            // A value is prohibited when some instance at the leaf has that
            // value (or a missing value) and, if m_relabel is set, also has
            // the leaf class.
            boolean[] prohibit = new boolean[l.attribute(a).numValues()];
            for (int aval = 0; aval < n.attribute(a).numValues(); aval++) {
                for (int x = 0; x < l.numInstances(); x++) {
                    if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval)
                            && (!m_relabel || (l.instance(x).classValue() == leafClass))) {
                        prohibit[aval] = true;
                        break;
                    }
                }
            }

            // (a) find values of
            //   $n: instances in atbop (already have that, actually)
            //   $v: $v is a value for $a
            //   $k: $k is a class
            // that maximize L' = Laplace({$x: $x contained in cases($n)
            //   & value($a,$x) = $v}, $k).
            double bestVal = Double.NaN;
            double bestClass = Double.NaN;
            double bestLaplace = leafLaplace;
            double[][] bestCounts = null;
            double[][] counts = new double[2][n.numClasses()];

            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    continue;

                // zero the counts
                for (int c = 0; c < n.numClasses(); c++)
                    counts[0][c] = 0;

                double theval = n.instance(sorted[x]).value(a);
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // absorb all following instances with the same value; x is
                // advanced together with z so the outer loop skips them
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                if (!prohibit[(int) theval]) {
                    // work out best laplace for the instances whose value
                    // equals theval (best class is searched per value)
                    double total = Utils.sum(counts[0]);
                    bestLaplace = leafLaplace;
                    bestClass = Double.NaN;
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[0][c] + 1.0) / (total + 2.0);
                        if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) {
                            bestLaplace = temp;
                            bestClass = c;
                            bestVal = theval;
                            bestCounts = copyCounts(counts);
                        }
                    }
                    // add to graft list
                    if (!Double.isNaN(bestClass)) {
                        GraftSplit gsplit = null;
                        try {
                            gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts);
                        } catch (Exception e) {
                            // NOTE(review): terminates the whole JVM on failure
                            System.err.println("graftsplit error: " + e.getMessage());
                            System.exit(1);
                        }
                        t.add(gsplit);
                    }
                }
            }
            // (b) add to t tuple <n,a,v,k,L',"=">
            //     done this already
        }
    }

    // 4. remove from t all tuples <n,a,v,c,L,x> such that L <=
    //    Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c) <= 0.05
    //    -- checked this constraint prior to adding a tuple --

    // *** step six done before step five for efficiency ***
    // 6. for each <n,a,v,k,L,x> in t ordered on L from highest to lowest
    //    order the tuples from highest to lowest laplace
    //    (this actually orders lowest to highest)
    Collections.sort(t);

    // 5. remove from t all tuples <n,a,v,c,L,x> such that there is
    //    no tuple <n',a',v',k',L',x'> such that k' != c & L' < L.
    //    (x is decremented after each removal, so this keeps inspecting the
    //    head of the list until a graft with a different class is found)
    for (int x = 0; x < t.size(); x++) {
        GraftSplit gs = (GraftSplit) t.get(x);
        if (gs.maxClassForSubsetOfInterest() != leafClass) {
            break; // reached a graft with class != leafClass, so stop deleting
        } else {
            t.remove(x);
            x--;
        }
    }

    // if no potential grafts were found, do nothing and return
    if (t.size() < 1) {
        return;
    }

    // create the distributions for each graft, from the last list entry to
    // the first; each graft removes its cases from l before the next is built
    for (int x = t.size() - 1; x >= 0; x--) {
        GraftSplit gs = (GraftSplit) t.get(x);
        try {
            gs.buildClassifier(l);
            gs.deleteGraftedCases(l); // so they don't go down the other branch
        } catch (Exception e) {
            // failure is reported but does not stop the remaining grafts
            System.err.println("graftsplit build error: " + e.getMessage());
        }
    }

    // add this stuff to the tree
    ((C45PruneableClassifierTreeG) parent).setDescendents(t, this);
}
From source file:j48.C45Split.java
License:Open Source License
/** * Creates split on numeric attribute./*from w w w .ja v a 2s . c o m*/ * * @exception Exception * if something goes wrong */ private void handleNumericAttribute(Instances trainInstances) throws Exception { int firstMiss; int next = 1; int last = 0; int splitIndex = -1; double currentInfoGain; double defaultEnt; double minSplit; Instance instance; int i; // Current attribute is a numeric attribute. m_distribution = new Distribution(2, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); i = 0; while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (instance.isMissing(m_attIndex)) break; m_distribution.add(1, instance); i++; } firstMiss = i; // Compute minimum number of Instances required in each // subset. minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses()); if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj; else if (Utils.gr(minSplit, 25)) minSplit = 25; // Enough Instances with known values? if (Utils.sm((double) firstMiss, 2 * minSplit)) return; // Compute values of criteria for all possible split // indices. defaultEnt = infoGainCrit.oldEnt(m_distribution); while (next < firstMiss) { if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next) .value(m_attIndex)) { // Move class values for all Instances up to next // possible split point. m_distribution.shiftRange(1, 0, trainInstances, last, next); // Check if enough Instances in each subset and compute // values for criteria. if (Utils.grOrEq(m_distribution.perBag(0), minSplit) && Utils.grOrEq(m_distribution.perBag(1), minSplit)) { currentInfoGain = infoGainCrit.splitCritValue1(m_distribution, m_sumOfWeights, defaultEnt, rrrrr); if (Utils.gr(currentInfoGain, m_infoGain)) { m_infoGain = currentInfoGain; splitIndex = next - 1; } m_index++; } last = next; } next++; } // Was there any useful split? 
if (m_index == 0) return; // Compute modified information gain for best split. m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights); if (Utils.smOrEq(m_infoGain, 0)) return; // Set instance variables' values to values for // best split. m_numSubsets = 2; m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex) + trainInstances.instance(splitIndex).value(m_attIndex)) / 2; // In case we have a numerical precision problem we need to choose the // smaller value if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) { m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex); } // Restore distributioN for best split. m_distribution = new Distribution(2, trainInstances.numClasses()); m_distribution.addRange(0, trainInstances, 0, splitIndex + 1); m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss); // Compute modified gain ratio for best split. m_gainRatio = gainRatioCrit.splitCritValue1(m_distribution, m_sumOfWeights, m_infoGain, lllll); }
From source file:j48.Distribution.java
License:Open Source License
/**
 * Adds the weights of the instances {@code source[startIndex .. lastPlusOne-1]}
 * to the given bag.
 *
 * <p>Per instance the per-class-per-bag and per-class counts are increased by
 * the instance weight; the summed weight of the range is then added to the
 * bag total and to the overall total.
 *
 * @param bagIndex    the bag to add to
 * @param source      the instances to read from
 * @param startIndex  index of the first instance of the range
 * @param lastPlusOne index one past the last instance of the range
 * @exception Exception if something goes wrong
 */
public final void addRange(int bagIndex, Instances source, int startIndex, int lastPlusOne)
        throws Exception {

    double rangeWeight = 0;

    for (int pos = startIndex; pos < lastPlusOne; pos++) {
        final Instance current = (Instance) source.instance(pos);
        final int cls = (int) current.classValue();
        final double w = current.weight();

        rangeWeight = rangeWeight + w;
        m_perClassPerBag[bagIndex][cls] += w;
        m_perClass[cls] += w;
    }

    m_perBag[bagIndex] += rangeWeight;
    totaL += rangeWeight;
}
From source file:j48.Distribution.java
License:Open Source License
/** * Deletes all instances in given range from given bag. * * @exception Exception if something goes wrong *///from www . j av a2s .co m public final void delRange(int bagIndex, Instances source, int startIndex, int lastPlusOne) throws Exception { double sumOfWeights = 0; int classIndex; Instance instance; int i; for (i = startIndex; i < lastPlusOne; i++) { instance = (Instance) source.instance(i); classIndex = (int) instance.classValue(); sumOfWeights = sumOfWeights + instance.weight(); m_perClassPerBag[bagIndex][classIndex] -= instance.weight(); m_perClass[classIndex] -= instance.weight(); } m_perBag[bagIndex] -= sumOfWeights; totaL -= sumOfWeights; }
From source file:j48.Distribution.java
License:Open Source License
/**
 * Moves the weights of the instances {@code source[startIndex .. lastPlusOne-1]}
 * from one bag to another.
 *
 * <p>Only the per-bag figures are touched: each instance's weight is taken
 * out of bag {@code from} and put into bag {@code to}, both in the
 * per-class-per-bag counts and in the bag totals.
 *
 * @param from        the bag the weights are removed from
 * @param to          the bag the weights are added to
 * @param source      the instances to read from
 * @param startIndex  index of the first instance of the range
 * @param lastPlusOne index one past the last instance of the range
 * @exception Exception if something goes wrong
 */
public final void shiftRange(int from, int to, Instances source, int startIndex, int lastPlusOne)
        throws Exception {

    int pos = startIndex;
    while (pos < lastPlusOne) {
        final Instance moved = (Instance) source.instance(pos);
        final int cls = (int) moved.classValue();
        final double w = moved.weight();

        m_perClassPerBag[from][cls] -= w;
        m_perClassPerBag[to][cls] += w;
        m_perBag[from] -= w;
        m_perBag[to] += w;

        pos++;
    }
}
From source file:j48.GraftSplit.java
License:Open Source License
/** * deletes the cases in data that belong to leaf pointed to by * the test (i.e. the subset of interest). this is useful so * the instances belonging to that leaf aren't passed down the * other branch./*from ww w. j a va 2s. c om*/ * * @param data the instances to delete from */ public void deleteGraftedCases(Instances data) { int subOfInterest = subsetOfInterest(); for (int x = 0; x < data.numInstances(); x++) { if (whichSubset(data.instance(x)) == subOfInterest) { data.delete(x--); } } }
From source file:j48.GraftSplit.java
License:Open Source License
/** * builds m_graftdistro using the passed data * * @param data the instances to use when creating the distribution *//*from w w w . j av a 2 s . co m*/ public void buildClassifier(Instances data) throws Exception { // distribution for the graft, not counting cases in atbop, only orig leaf m_graftdistro = new Distribution(2, data.numClasses()); // which subset are we looking at for the graft? int subset = subsetOfInterest(); // this is the subset for m_leaf double thisNodeCount = 0; double knownCases = 0; boolean allKnown = true; // populate distribution for (int x = 0; x < data.numInstances(); x++) { Instance instance = data.instance(x); if (instance.isMissing(m_attIndex)) { allKnown = false; continue; } knownCases += instance.weight(); int subst = whichSubset(instance); if (subst == -1) continue; m_graftdistro.add(subst, instance); if (subst == subset) { // instance belongs at m_leaf thisNodeCount += instance.weight(); } } double factor = (knownCases == 0) ? (1.0 / (double) 2.0) : (thisNodeCount / knownCases); if (!allKnown) { for (int x = 0; x < data.numInstances(); x++) { if (data.instance(x).isMissing(m_attIndex)) { Instance instance = data.instance(x); int subst = whichSubset(instance); if (subst == -1) continue; instance.setWeight(instance.weight() * factor); m_graftdistro.add(subst, instance); } } } // if there are no cases at the leaf, make sure the desired // class is chosen, by setting counts to 0.01 if (m_graftdistro.perBag(subset) == 0) { double[] counts = new double[data.numClasses()]; counts[m_maxClass] = 0.01; m_graftdistro.add(subset, counts); } if (m_graftdistro.perBag((subset == 0) ? 1 : 0) == 0) { double[] counts = new double[data.numClasses()]; counts[(int) m_otherLeafMaxClass] = 0.01; m_graftdistro.add((subset == 0) ? 1 : 0, counts); } }
From source file:j48.NBTreeNoSplit.java
License:Open Source License
/** * Utility method for fast 5-fold cross validation of a naive bayes * model//from w ww . j a v a 2 s. c o m * * @param fullModel a <code>NaiveBayesUpdateable</code> value * @param trainingSet an <code>Instances</code> value * @param r a <code>Random</code> value * @return a <code>double</code> value * @exception Exception if an error occurs */ public static double crossValidate(NaiveBayesUpdateable fullModel, Instances trainingSet, Random r) throws Exception { // make some copies for fast evaluation of 5-fold xval Classifier[] copies = Classifier.makeCopies(fullModel, 5); Evaluation eval = new Evaluation(trainingSet); // make some splits for (int j = 0; j < 5; j++) { Instances test = trainingSet.testCV(5, j); // unlearn these test instances for (int k = 0; k < test.numInstances(); k++) { test.instance(k).setWeight(-test.instance(k).weight()); ((NaiveBayesUpdateable) copies[j]).updateClassifier(test.instance(k)); // reset the weight back to its original value test.instance(k).setWeight(-test.instance(k).weight()); } eval.evaluateModel(copies[j], test); } return eval.incorrect(); }