List of usage examples for weka.core Instances numInstances
public int numInstances()
From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License:Open Source License
/** * builds the classifier// w w w .j a v a 2 s. co m * * @param data the training instances * @throws Exception if something goes wrong */ @Override public void buildClassifier(Instances data) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(data); // save original header (needed for clusters to classes output) m_OriginalHeader = data.stringFreeStructure(); // remove class attribute for clusterer Instances clusterData = new Instances(data); clusterData.setClassIndex(-1); clusterData.deleteAttributeAt(data.classIndex()); m_ClusteringHeader = clusterData.stringFreeStructure(); if (m_ClusteringHeader.numAttributes() == 0) { System.err.println("Data contains only class attribute, defaulting to ZeroR model."); m_ZeroR = new ZeroR(); m_ZeroR.buildClassifier(data); } else { m_ZeroR = null; // build clusterer m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer); m_ActualClusterer.buildClusterer(clusterData); if (!getLabelAllClusters()) { // determine classes-to-clusters mapping ClusterEvaluation eval = new ClusterEvaluation(); eval.setClusterer(m_ActualClusterer); eval.evaluateClusterer(clusterData); double[] clusterAssignments = eval.getClusterAssignments(); int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()]; int[] clusterTotals = new int[eval.getNumClusters()]; double[] best = new double[eval.getNumClusters() + 1]; double[] current = new double[eval.getNumClusters() + 1]; for (int i = 0; i < data.numInstances(); i++) { Instance instance = data.instance(i); if (!instance.classIsMissing()) { counts[(int) clusterAssignments[i]][(int) instance.classValue()]++; clusterTotals[(int) clusterAssignments[i]]++; } } best[eval.getNumClusters()] = Double.MAX_VALUE; ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0); m_ClustersToClasses = new double[best.length]; System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length); } else { m_ClusterClassProbs = new 
double[m_ActualClusterer.numberOfClusters()][data.numClasses()]; for (int i = 0; i < data.numInstances(); i++) { Instance clusterInstance = clusterData.instance(i); Instance originalInstance = data.instance(i); if (!originalInstance.classIsMissing()) { double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance); for (int j = 0; j < probs.length; j++) { m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j]; } } } for (int i = 0; i < m_ClusterClassProbs.length; i++) { Utils.normalize(m_ClusterClassProbs[i]); } } } }
From source file:com.tum.classifiertest.DataCache.java
License:Open Source License
/**
 * Creates a DataCache by copying data from a weka.core.Instances object.
 *
 * <p>Attribute values are stored as floats in an attribute-major array, with
 * missing values encoded as {@code Float.MAX_VALUE}. Instance weights and class
 * values are copied into flat arrays, and for every non-class attribute the
 * result of {@code FastRfUtils.sort} on its value column is stored in
 * {@code sortedIndices}. The entry for the class attribute is left
 * {@code null}.
 *
 * @param origData the dataset to copy
 * @throws Exception if an attribute is neither numeric nor nominal
 */
public DataCache(Instances origData) throws Exception {

  classIndex = origData.classIndex();
  numAttributes = origData.numAttributes();
  numClasses = origData.numClasses();
  numInstances = origData.numInstances();

  // 0 marks a numeric attribute, otherwise the number of nominal values
  attNumVals = new int[origData.numAttributes()];
  for (int i = 0; i < attNumVals.length; i++) {
    if (origData.attribute(i).isNumeric()) {
      attNumVals[i] = 0;
    } else if (origData.attribute(i).isNominal()) {
      attNumVals[i] = origData.attribute(i).numValues();
    } else {
      throw new Exception("Only numeric and nominal attributes are supported.");
    }
  }

  /* Array is indexed by attribute first, to speed access in RF splitting. */
  vals = new float[numAttributes][numInstances];
  for (int a = 0; a < numAttributes; a++) {
    for (int i = 0; i < numInstances; i++) {
      if (origData.instance(i).isMissing(a)) {
        vals[a][i] = Float.MAX_VALUE; // to make sure missing values go to the end
      } else {
        vals[a][i] = (float) origData.instance(i).value(a); // deep copy
      }
    }
  }

  instWeights = new double[numInstances];
  instClassValues = new int[numInstances];
  for (int i = 0; i < numInstances; i++) {
    instWeights[i] = origData.instance(i).weight();
    instClassValues[i] = (int) origData.instance(i).classValue();
  }

  /* compute the sortedInstances for the whole dataset */
  sortedIndices = new int[numAttributes][];
  for (int a = 0; a < numAttributes; a++) {
    if (a == classIndex) {
      continue;
    }
    // Nominal and numeric attributes are handled identically (nominal ones are
    // sorted as well as of FastRF 0.99); missing values are coded as
    // Float.MAX_VALUE and go to the end. No array is pre-allocated here
    // because the result of sort() is what gets stored.
    sortedIndices[a] = FastRfUtils.sort(vals[a]);
  }
}
From source file:com.tum.classifiertest.FastRfBagging.java
License:Open Source License
/** * Bagging method. Produces DataCache objects with bootstrap samples of * the original data, and feeds them to the base classifier (which can only * be a FastRandomTree)./* w w w . java 2 s .co m*/ * * @param data The training set to be used for generating the * bagged classifier. * @param numThreads The number of simultaneous threads to use for * computation. Pass zero (0) for autodetection. * @param motherForest A reference to the FastRandomForest object that * invoked this. * * @throws Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data, int numThreads, FastRandomForest motherForest) throws Exception { // can classifier handle the vals? getCapabilities().testWithFail(data); // remove instances with missing class data = new Instances(data); data.deleteWithMissingClass(); if (!(m_Classifier instanceof FastRandomTree)) throw new IllegalArgumentException( "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier."); /* We fill the m_Classifiers array by creating lots of trees with new() * because this is much faster than using serialization to deep-copy the * one tree in m_Classifier - this is what the super.buildClassifier(data) * normally does. 
*/ m_Classifiers = new Classifier[m_NumIterations]; for (int i = 0; i < m_Classifiers.length; i++) { FastRandomTree curTree = new FastRandomTree(); // all parameters for training will be looked up in the motherForest (maxDepth, k_Value) curTree.m_MotherForest = motherForest; // 0.99: reference to these arrays will get passed down all nodes so the array can be re-used // 0.99: this array is of size two as now all splits are binary - even categorical ones curTree.tempProps = new double[2]; curTree.tempDists = new double[2][]; curTree.tempDists[0] = new double[data.numClasses()]; curTree.tempDists[1] = new double[data.numClasses()]; curTree.tempDistsOther = new double[2][]; curTree.tempDistsOther[0] = new double[data.numClasses()]; curTree.tempDistsOther[1] = new double[data.numClasses()]; m_Classifiers[i] = curTree; } // this was SLOW.. takes approx 1/2 time as training the forest afterwards (!!!) // super.buildClassifier(data); if (m_CalcOutOfBag && (m_BagSizePercent != 100)) { throw new IllegalArgumentException( "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!"); } // sorting is performed inside this constructor DataCache myData = new DataCache(data); int bagSize = data.numInstances() * m_BagSizePercent / 100; Random random = new Random(m_Seed); boolean[][] inBag = new boolean[m_Classifiers.length][]; // thread management ExecutorService threadPool = Executors .newFixedThreadPool(numThreads > 0 ? 
numThreads : Runtime.getRuntime().availableProcessors()); List<Future<?>> futures = new ArrayList<Future<?>>(m_Classifiers.length); try { for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) { // create the in-bag dataset (and be sure to remember what's in bag) // for computing the out-of-bag error later DataCache bagData = myData.resample(bagSize, random); bagData.reusableRandomGenerator = bagData.getRandomNumberGenerator(random.nextInt()); inBag[treeIdx] = bagData.inBag; // store later for OOB error calculation // build the classifier if (m_Classifiers[treeIdx] instanceof FastRandomTree) { FastRandomTree aTree = (FastRandomTree) m_Classifiers[treeIdx]; aTree.data = bagData; Future<?> future = threadPool.submit(aTree); futures.add(future); } else { throw new IllegalArgumentException( "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier."); } } // make sure all trees have been trained before proceeding for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) { futures.get(treeIdx).get(); } // calc OOB error? 
if (getCalcOutOfBag() || getComputeImportances()) { //m_OutOfBagError = computeOOBError(data, inBag, threadPool); m_OutOfBagError = computeOOBError(myData, inBag, threadPool); } else { m_OutOfBagError = 0; } //calc feature importances m_FeatureImportances = null; //m_FeatureNames = null; if (getComputeImportances()) { m_FeatureImportances = new double[data.numAttributes()]; ///m_FeatureNames = new String[data.numAttributes()]; //Instances dataCopy = new Instances(data); //To scramble //int[] permutation = FastRfUtils.randomPermutation(data.numInstances(), random); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { //double sError = computeOOBError(FastRfUtils.scramble(data, dataCopy, j, permutation), inBag, threadPool); //double sError = computeOOBError(data, inBag, threadPool, j, 0); float[] unscrambled = myData.scrambleOneAttribute(j, random); double sError = computeOOBError(myData, inBag, threadPool); myData.vals[j] = unscrambled; // restore the original state m_FeatureImportances[j] = sError - m_OutOfBagError; } //m_FeatureNames[j] = data.attribute(j).name(); } } threadPool.shutdown(); } finally { threadPool.shutdownNow(); } }
From source file:com.tum.classifiertest.FastRfBagging.java
License:Open Source License
/** * Compute the out-of-bag error for a set of instances. * * @param data the instances/*from w ww . ja v a2s . co m*/ * @param inBag numTrees x numInstances indicating out-of-bag instances * @param threadPool the pool of threads * * @return the oob error */ private double computeOOBError(Instances data, boolean[][] inBag, ExecutorService threadPool) throws InterruptedException, ExecutionException { boolean numeric = data.classAttribute().isNumeric(); List<Future<Double>> votes = new ArrayList<Future<Double>>(data.numInstances()); for (int i = 0; i < data.numInstances(); i++) { VotesCollector aCollector = new VotesCollector(m_Classifiers, i, data, inBag); votes.add(threadPool.submit(aCollector)); } double outOfBagCount = 0.0; double errorSum = 0.0; for (int i = 0; i < data.numInstances(); i++) { double vote = votes.get(i).get(); // error for instance outOfBagCount += data.instance(i).weight(); if (numeric) { errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight(); } else { if (vote != data.instance(i).classValue()) errorSum += data.instance(i).weight(); } } return errorSum / outOfBagCount; }
From source file:com.tum.classifiertest.FastRfUtils.java
License:Open Source License
/**
 * Scrambles the values of one attribute in {@code dst} according to a given
 * permutation.
 * <p/>
 * For every row, the value of the attribute at {@code attIndex - 1} (when
 * {@code attIndex} is positive) is copied back unchanged from {@code src},
 * and the value of the attribute at {@code attIndex} is taken from the
 * {@code src} row selected by {@code perm}.
 *
 * @param src      the source dataset
 * @param dst      the scrambled dataset
 * @param attIndex the attribute index
 * @param perm     the random permutation
 *
 * @return {@code dst}, to allow call chaining
 */
public static Instances scramble(Instances src, Instances dst, final int attIndex, int[] perm) {
  final int rows = src.numInstances();
  final boolean restorePrevious = attIndex > 0;

  for (int row = 0; row < rows; row++) {
    Instance target = dst.instance(row);
    if (restorePrevious) {
      // undo the scrambling applied to the preceding attribute
      double original = src.instance(row).value(attIndex - 1);
      target.setValue(attIndex - 1, original);
    }
    double permuted = src.instance(perm[row]).value(attIndex);
    target.setValue(attIndex, permuted);
  }

  return dst;
}
From source file:com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java
License:Apache License
/** * Samoa instances from weka instances.//from w w w .j ava2s . c o m * * @param instances the instances * @return the instances */ public Instances samoaInstances(weka.core.Instances instances) { Instances samoaInstances = samoaInstancesInformation(instances); //We assume that we have only one samoaInstanceInformation for WekaToSamoaInstanceConverter this.samoaInstanceInformation = samoaInstances; for (int i = 0; i < instances.numInstances(); i++) { samoaInstances.add(samoaInstance(instances.instance(i))); } return samoaInstances; }
From source file:com.zooclassifier.Model.FileLoader.java
/**
 * Loads an ARFF file and extracts its contents as plain strings.
 *
 * <p>The last attribute is used as the class. For every instance the string
 * values of all other attributes go into {@code attributes} and the string
 * value of the last attribute into {@code labels}. The declared values of
 * each attribute are stored in {@code attributesLegalValues} and
 * {@code labelsLegalValues}.
 *
 * @param filename path of the ARFF file to read
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading or closing the file fails
 */
public FileLoader(String filename) throws FileNotFoundException, IOException {
    Instances data;
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        // ArffReader(Reader) is documented to read the data completely, so the
        // reader is no longer needed once getData() has returned.
        ArffLoader.ArffReader arff = new ArffLoader.ArffReader(reader);
        data = arff.getData();
    } finally {
        // always release the file handle, even if parsing fails
        reader.close();
    }

    data.setClassIndex(data.numAttributes() - 1);

    attributes = new String[data.numInstances()][data.numAttributes() - 1];
    labels = new String[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < instance.numAttributes() - 1; j++) {
            attributes[i][j] = instance.stringValue(j);
        }
        labels[i] = instance.stringValue(instance.numAttributes() - 1);
    }

    attributesLegalValues = new String[data.numAttributes() - 1][];
    for (int i = 0; i < data.numAttributes() - 1; i++) {
        attributesLegalValues[i] = (String[]) Collections.list(data.attribute(i).enumerateValues())
                .toArray(new String[data.attribute(i).numValues()]);
    }

    labelsLegalValues = (String[]) Collections.list(data.attribute(data.numAttributes() - 1).enumerateValues())
            .toArray(new String[data.attribute(data.numAttributes() - 1).numValues()]);
}
From source file:core.classifier.MyFirstClassifier.java
License:Open Source License
/** * Method for building the classifier. Implements a one-against-one * wrapper for multi-class problems.//from w w w. ja va2s .c om * * @param insts the set of training instances * @throws Exception if the classifier can't be built successfully */ public void buildClassifier(Instances insts) throws Exception { if (!m_checksTurnedOff) { // can classifier handle the data? getCapabilities().testWithFail(insts); // remove instances with missing class insts = new Instances(insts); insts.deleteWithMissingClass(); /* Removes all the instances with weight equal to 0. MUST be done since condition (8) of Keerthi's paper is made with the assertion Ci > 0 (See equation (3a). */ Instances data = new Instances(insts, insts.numInstances()); for (int i = 0; i < insts.numInstances(); i++) { if (insts.instance(i).weight() > 0) data.add(insts.instance(i)); } if (data.numInstances() == 0) { throw new Exception("No training instances left after removing " + "instances with weight 0!"); } insts = data; } if (!m_checksTurnedOff) { m_Missing = new ReplaceMissingValues(); m_Missing.setInputFormat(insts); insts = Filter.useFilter(insts, m_Missing); } else { m_Missing = null; } if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) { boolean onlyNumeric = true; if (!m_checksTurnedOff) { for (int i = 0; i < insts.numAttributes(); i++) { if (i != insts.classIndex()) { if (!insts.attribute(i).isNumeric()) { onlyNumeric = false; break; } } } } if (!onlyNumeric) { m_NominalToBinary = new NominalToBinary(); m_NominalToBinary.setInputFormat(insts); insts = Filter.useFilter(insts, m_NominalToBinary); } else { m_NominalToBinary = null; } } else { m_NominalToBinary = null; } if (m_filterType == FILTER_STANDARDIZE) { m_Filter = new Standardize(); m_Filter.setInputFormat(insts); insts = Filter.useFilter(insts, m_Filter); } else if (m_filterType == FILTER_NORMALIZE) { m_Filter = new Normalize(); m_Filter.setInputFormat(insts); insts = Filter.useFilter(insts, m_Filter); } else { m_Filter = null; 
} m_classIndex = insts.classIndex(); m_classAttribute = insts.classAttribute(); m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0); // Generate subsets representing each class Instances[] subsets = new Instances[insts.numClasses()]; for (int i = 0; i < insts.numClasses(); i++) { subsets[i] = new Instances(insts, insts.numInstances()); } for (int j = 0; j < insts.numInstances(); j++) { Instance inst = insts.instance(j); subsets[(int) inst.classValue()].add(inst); } for (int i = 0; i < insts.numClasses(); i++) { subsets[i].compactify(); } // Build the binary classifiers Random rand = new Random(m_randomSeed); m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()]; for (int i = 0; i < insts.numClasses(); i++) { for (int j = i + 1; j < insts.numClasses(); j++) { m_classifiers[i][j] = new BinarySMO(); m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel())); Instances data = new Instances(insts, insts.numInstances()); for (int k = 0; k < subsets[i].numInstances(); k++) { data.add(subsets[i].instance(k)); } for (int k = 0; k < subsets[j].numInstances(); k++) { data.add(subsets[j].instance(k)); } data.compactify(); data.randomize(rand); m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed); } } }
From source file:core.ClusterEvaluationEX.java
License:Open Source License
public Instances DeleteNoise(Instances data) { noise = data.stringFreeStructure();// w ww . j a v a 2s. c om for (int i = 0; i < data.numInstances(); i++) { if (data.instance(i).value(1) == -1) { noise.add(data.instance(i)); data.delete(i); i--; } } return data; }
From source file:core.ClusterEvaluationEX.java
License:Open Source License
/** * Perform a cross-validation for DensityBasedClusterer on a set of instances. * * @param clusterer the clusterer to use * @param data the training data//from www .j ava 2s . co m * @param numFolds number of folds of cross validation to perform * @param random random number seed for cross-validation * @return the cross-validated log-likelihood * @throws Exception if an error occurs */ public static double crossValidateModel(DensityBasedClusterer clusterer, Instances data, int numFolds, Random random) throws Exception { Instances train, test; double foldAv = 0; ; data = new Instances(data); data.randomize(random); // double sumOW = 0; for (int i = 0; i < numFolds; i++) { // Build and test clusterer train = data.trainCV(numFolds, i, random); clusterer.buildClusterer(train); test = data.testCV(numFolds, i); for (int j = 0; j < test.numInstances(); j++) { try { foldAv += ((DensityBasedClusterer) clusterer).logDensityForInstance(test.instance(j)); // sumOW += test.instance(j).weight(); // double temp = Utils.sum(tempDist); } catch (Exception ex) { // unclustered instances } } } // return foldAv / sumOW; return foldAv / data.numInstances(); }