List of usage examples for weka.core Instance isMissing
public boolean isMissing(Attribute att);
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for nominal attribute and nominal class * and returns value./*from w ww .j a v a 2 s . com*/ * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNominalNominal(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal; double[][] counts = new double[m_Instances.attribute(index).numValues() + 1][m_Instances.numClasses()]; double[] sumCounts = new double[m_Instances.numClasses()]; double[][] bestDist = new double[3][m_Instances.numClasses()]; int numMissing = 0; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (inst.isMissing(index)) { numMissing++; counts[m_Instances.attribute(index).numValues()][(int) inst.classValue()] += inst.weight(); } else { counts[(int) inst.value(index)][(int) inst.classValue()] += inst.weight(); } } // Compute sum of counts for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { sumCounts[j] += counts[i][j]; } } // Make split counts for each possible split and evaluate System.arraycopy(counts[m_Instances.attribute(index).numValues()], 0, m_Distribution[2], 0, m_Instances.numClasses()); for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { m_Distribution[0][j] = counts[i][j]; m_Distribution[1][j] = sumCounts[j] - counts[i][j]; } currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution); if (currVal < bestVal) { bestVal = currVal; m_SplitPoint = (double) i; for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } } } // No missing values in training data. if (numMissing == 0) { System.arraycopy(sumCounts, 0, bestDist[2], 0, m_Instances.numClasses()); } m_Distribution = bestDist; return bestVal; }
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for nominal attribute and numeric class * and returns value.//from w w w . ja v a2 s .co m * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNominalNumeric(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal; double[] sumsSquaresPerValue = new double[m_Instances.attribute(index).numValues()], sumsPerValue = new double[m_Instances.attribute(index).numValues()], weightsPerValue = new double[m_Instances.attribute(index).numValues()]; double totalSumSquaresW = 0, totalSumW = 0, totalSumOfWeightsW = 0, totalSumOfWeights = 0, totalSum = 0; double[] sumsSquares = new double[3], sumOfWeights = new double[3]; double[][] bestDist = new double[3][1]; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (inst.isMissing(index)) { m_Distribution[2][0] += inst.classValue() * inst.weight(); sumsSquares[2] += inst.classValue() * inst.classValue() * inst.weight(); sumOfWeights[2] += inst.weight(); } else { weightsPerValue[(int) inst.value(index)] += inst.weight(); sumsPerValue[(int) inst.value(index)] += inst.classValue() * inst.weight(); sumsSquaresPerValue[(int) inst.value(index)] += inst.classValue() * inst.classValue() * inst.weight(); } totalSumOfWeights += inst.weight(); totalSum += inst.classValue() * inst.weight(); } // Check if the total weight is zero if (totalSumOfWeights <= 0) { return bestVal; } // Compute sum of counts without missing ones for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { totalSumOfWeightsW += weightsPerValue[i]; totalSumSquaresW += sumsSquaresPerValue[i]; totalSumW += sumsPerValue[i]; } // Make split counts for each possible split and evaluate for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { m_Distribution[0][0] = sumsPerValue[i]; sumsSquares[0] = sumsSquaresPerValue[i]; sumOfWeights[0] = weightsPerValue[i]; m_Distribution[1][0] = totalSumW - sumsPerValue[i]; sumsSquares[1] = totalSumSquaresW - sumsSquaresPerValue[i]; sumOfWeights[1] = totalSumOfWeightsW - weightsPerValue[i]; currVal = variance(m_Distribution, sumsSquares, sumOfWeights); if (currVal < bestVal) { bestVal = currVal; m_SplitPoint = (double) i; for (int j = 0; j < 3; j++) { if (sumOfWeights[j] > 0) { bestDist[j][0] = m_Distribution[j][0] / sumOfWeights[j]; } else { bestDist[j][0] = totalSum / totalSumOfWeights; } } } } m_Distribution = bestDist; return bestVal; }
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for numeric attribute and nominal class * and returns value./*from w w w . ja va2 s . com*/ * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNumericNominal(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal, currCutPoint; int numMissing = 0; double[] sum = new double[m_Instances.numClasses()]; double[][] bestDist = new double[3][m_Instances.numClasses()]; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (!inst.isMissing(index)) { m_Distribution[1][(int) inst.classValue()] += inst.weight(); } else { m_Distribution[2][(int) inst.classValue()] += inst.weight(); numMissing++; } } System.arraycopy(m_Distribution[1], 0, sum, 0, m_Instances.numClasses()); // Save current distribution as best distribution for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } // Sort instances m_Instances.sort(index); // Make split counts for each possible split and evaluate for (int i = 0; i < m_Instances.numInstances() - (numMissing + 1); i++) { Instance inst = m_Instances.instance(i); Instance instPlusOne = m_Instances.instance(i + 1); m_Distribution[0][(int) inst.classValue()] += inst.weight(); m_Distribution[1][(int) inst.classValue()] -= inst.weight(); if (inst.value(index) < instPlusOne.value(index)) { currCutPoint = (inst.value(index) + instPlusOne.value(index)) / 2.0; currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution); if (currVal < bestVal) { m_SplitPoint = currCutPoint; bestVal = currVal; for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } } } } // No missing values in training data. if (numMissing == 0) { System.arraycopy(sum, 0, bestDist[2], 0, m_Instances.numClasses()); } m_Distribution = bestDist; return bestVal; }
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Finds best split for numeric attribute and numeric class * and returns value.//from w w w . j a v a 2 s . c o m * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNumericNumeric(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal, currCutPoint; int numMissing = 0; double[] sumsSquares = new double[3], sumOfWeights = new double[3]; double[][] bestDist = new double[3][1]; double totalSum = 0, totalSumOfWeights = 0; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (!inst.isMissing(index)) { m_Distribution[1][0] += inst.classValue() * inst.weight(); sumsSquares[1] += inst.classValue() * inst.classValue() * inst.weight(); sumOfWeights[1] += inst.weight(); } else { m_Distribution[2][0] += inst.classValue() * inst.weight(); sumsSquares[2] += inst.classValue() * inst.classValue() * inst.weight(); sumOfWeights[2] += inst.weight(); numMissing++; } totalSumOfWeights += inst.weight(); totalSum += inst.classValue() * inst.weight(); } // Check if the total weight is zero if (totalSumOfWeights <= 0) { return bestVal; } // Sort instances m_Instances.sort(index); // Make split counts for each possible split and evaluate for (int i = 0; i < m_Instances.numInstances() - (numMissing + 1); i++) { Instance inst = m_Instances.instance(i); Instance instPlusOne = m_Instances.instance(i + 1); m_Distribution[0][0] += inst.classValue() * inst.weight(); sumsSquares[0] += inst.classValue() * inst.classValue() * inst.weight(); sumOfWeights[0] += inst.weight(); m_Distribution[1][0] -= inst.classValue() * inst.weight(); sumsSquares[1] -= inst.classValue() * inst.classValue() * inst.weight(); sumOfWeights[1] -= inst.weight(); if (inst.value(index) < instPlusOne.value(index)) { currCutPoint = (inst.value(index) + instPlusOne.value(index)) / 2.0; currVal = variance(m_Distribution, sumsSquares, sumOfWeights); if (currVal < bestVal) { m_SplitPoint = currCutPoint; bestVal = currVal; for (int j = 0; j < 3; j++) { if (sumOfWeights[j] > 0) { bestDist[j][0] = m_Distribution[j][0] / sumOfWeights[j]; } else { bestDist[j][0] = totalSum / totalSumOfWeights; } } } } } m_Distribution = bestDist; return bestVal; }
From source file:boosting.classifiers.DecisionStumpWritable.java
License:Open Source License
/** * Returns the subset an instance falls into. * /*from w w w. j a v a 2 s . c o m*/ * @param instance the instance to check * @return the subset the instance falls into * @throws Exception if something goes wrong */ private int whichSubset(Instance instance) throws Exception { if (instance.isMissing(m_AttIndex)) { return 2; } else if (instance.attribute(m_AttIndex).isNominal()) { if ((int) instance.value(m_AttIndex) == m_SplitPoint) { return 0; } else { return 1; } } else { if (instance.value(m_AttIndex) <= m_SplitPoint) { return 0; } else { return 1; } } }
From source file:br.ufrn.ia.core.clustering.EMIaProject.java
License:Open Source License
private void EM_Init(Instances inst) throws Exception { int i, j, k;/* w w w. ja v a2 s. c om*/ // run k means 10 times and choose best solution SimpleKMeans bestK = null; double bestSqE = Double.MAX_VALUE; for (i = 0; i < 10; i++) { SimpleKMeans sk = new SimpleKMeans(); sk.setSeed(m_rr.nextInt()); sk.setNumClusters(m_num_clusters); sk.setDisplayStdDevs(true); sk.buildClusterer(inst); if (sk.getSquaredError() < bestSqE) { bestSqE = sk.getSquaredError(); bestK = sk; } } // initialize with best k-means solution m_num_clusters = bestK.numberOfClusters(); m_weights = new double[inst.numInstances()][m_num_clusters]; m_model = new DiscreteEstimator[m_num_clusters][m_num_attribs]; m_modelNormal = new double[m_num_clusters][m_num_attribs][3]; m_priors = new double[m_num_clusters]; Instances centers = bestK.getClusterCentroids(); Instances stdD = bestK.getClusterStandardDevs(); double[][][] nominalCounts = bestK.getClusterNominalCounts(); double[] clusterSizes = bestK.getClusterSizes(); for (i = 0; i < m_num_clusters; i++) { Instance center = centers.instance(i); for (j = 0; j < m_num_attribs; j++) { if (inst.attribute(j).isNominal()) { m_model[i][j] = new DiscreteEstimator(m_theInstances.attribute(j).numValues(), true); for (k = 0; k < inst.attribute(j).numValues(); k++) { m_model[i][j].addValue(k, nominalCounts[i][j][k]); } } else { double minStdD = (m_minStdDevPerAtt != null) ? m_minStdDevPerAtt[j] : m_minStdDev; double mean = (center.isMissing(j)) ? inst.meanOrMode(j) : center.value(j); m_modelNormal[i][j][0] = mean; double stdv = (stdD.instance(i).isMissing(j)) ? ((m_maxValues[j] - m_minValues[j]) / (2 * m_num_clusters)) : stdD.instance(i).value(j); if (stdv < minStdD) { stdv = inst.attributeStats(j).numericStats.stdDev; if (Double.isInfinite(stdv)) { stdv = minStdD; } if (stdv < minStdD) { stdv = minStdD; } } if (stdv <= 0) { stdv = m_minStdDev; } m_modelNormal[i][j][1] = stdv; m_modelNormal[i][j][2] = 1.0; } } } for (j = 0; j < m_num_clusters; j++) { // m_priors[j] += 1.0; m_priors[j] = clusterSizes[j]; } Utils.normalize(m_priors); }
From source file:br.ufrn.ia.core.clustering.EMIaProject.java
License:Open Source License
public double[] logDensityPerClusterForInstance(Instance inst) throws Exception { int i, j;//from w w w. j a va 2 s.co m double logprob; double[] wghts = new double[m_num_clusters]; m_replaceMissing.input(inst); inst = m_replaceMissing.output(); for (i = 0; i < m_num_clusters; i++) { // System.err.println("Cluster : "+i); logprob = 0.0; for (j = 0; j < m_num_attribs; j++) { if (!inst.isMissing(j)) { if (inst.attribute(j).isNominal()) { logprob += Math.log(m_model[i][j].getProbability(inst.value(j))); } else { // numeric attribute logprob += logNormalDens(inst.value(j), m_modelNormal[i][j][0], m_modelNormal[i][j][1]); /* * System.err.println(logNormalDens(inst.value(j), * m_modelNormal[i][j][0], m_modelNormal[i][j][1]) + * " "); */ } } } // System.err.println(""); wghts[i] = logprob; } return wghts; }
From source file:br.ufrn.ia.core.clustering.EMIaProject.java
License:Open Source License
private void M(Instances inst) throws Exception { int i, j, l;//ww w .j a v a2 s . c om new_estimators(); for (i = 0; i < m_num_clusters; i++) { for (j = 0; j < m_num_attribs; j++) { for (l = 0; l < inst.numInstances(); l++) { Instance in = inst.instance(l); if (!in.isMissing(j)) { if (inst.attribute(j).isNominal()) { m_model[i][j].addValue(in.value(j), in.weight() * m_weights[l][i]); } else { m_modelNormal[i][j][0] += (in.value(j) * in.weight() * m_weights[l][i]); m_modelNormal[i][j][2] += in.weight() * m_weights[l][i]; m_modelNormal[i][j][1] += (in.value(j) * in.value(j) * in.weight() * m_weights[l][i]); } } } } } // calcualte mean and std deviation for numeric attributes for (j = 0; j < m_num_attribs; j++) { if (!inst.attribute(j).isNominal()) { for (i = 0; i < m_num_clusters; i++) { if (m_modelNormal[i][j][2] <= 0) { m_modelNormal[i][j][1] = Double.MAX_VALUE; // m_modelNormal[i][j][0] = 0; m_modelNormal[i][j][0] = m_minStdDev; } else { // variance m_modelNormal[i][j][1] = (m_modelNormal[i][j][1] - (m_modelNormal[i][j][0] * m_modelNormal[i][j][0] / m_modelNormal[i][j][2])) / (m_modelNormal[i][j][2]); if (m_modelNormal[i][j][1] < 0) { m_modelNormal[i][j][1] = 0; } // std dev double minStdD = (m_minStdDevPerAtt != null) ? m_minStdDevPerAtt[j] : m_minStdDev; m_modelNormal[i][j][1] = Math.sqrt(m_modelNormal[i][j][1]); if ((m_modelNormal[i][j][1] <= minStdD)) { m_modelNormal[i][j][1] = inst.attributeStats(j).numericStats.stdDev; if ((m_modelNormal[i][j][1] <= minStdD)) { m_modelNormal[i][j][1] = minStdD; } } if ((m_modelNormal[i][j][1] <= 0)) { m_modelNormal[i][j][1] = m_minStdDev; } if (Double.isInfinite(m_modelNormal[i][j][1])) { m_modelNormal[i][j][1] = m_minStdDev; } // mean m_modelNormal[i][j][0] /= m_modelNormal[i][j][2]; } } } } }
From source file:br.ufrn.ia.core.clustering.EMIaProject.java
License:Open Source License
private void updateMinMax(Instance instance) { for (int j = 0; j < m_theInstances.numAttributes(); j++) { if (!instance.isMissing(j)) { if (Double.isNaN(m_minValues[j])) { m_minValues[j] = instance.value(j); m_maxValues[j] = instance.value(j); } else { if (instance.value(j) < m_minValues[j]) { m_minValues[j] = instance.value(j); } else { if (instance.value(j) > m_maxValues[j]) { m_maxValues[j] = instance.value(j); }/*w w w . ja v a2 s . com*/ } } } } }
From source file:cba.ItemSet.java
License:Open Source License
/** * Checks if an instance contains an item set. * * @param instance the instance to be tested * @return true if the given instance contains this item set *//*from w w w . j a v a 2 s.c o m*/ public boolean containedBy(Instance instance) { if (instance instanceof weka.core.SparseInstance && m_treatZeroAsMissing) { int numInstVals = instance.numValues(); int numItemSetVals = m_items.length; for (int p1 = 0, p2 = 0; p1 < numInstVals || p2 < numItemSetVals;) { int instIndex = Integer.MAX_VALUE; if (p1 < numInstVals) { instIndex = instance.index(p1); } int itemIndex = p2; if (m_items[itemIndex] > -1) { if (itemIndex != instIndex) { return false; } else { if (instance.isMissingSparse(p1)) { return false; } if (m_items[itemIndex] != (int) instance.valueSparse(p1)) { return false; } } p1++; p2++; } else { if (itemIndex < instIndex) { p2++; } else if (itemIndex == instIndex) { p2++; p1++; } } } } else { for (int i = 0; i < instance.numAttributes(); i++) if (m_items[i] > -1) { if (instance.isMissing(i) || (m_treatZeroAsMissing && (int) instance.value(i) == 0)) return false; if (m_items[i] != (int) instance.value(i)) return false; } } return true; }