Usage examples for the weka.core.Instances method classIndex()
public int classIndex()
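classIndex() returns the zero-based index of the attribute currently designated as the class, or -1 if no class attribute has been set. Before the examples below, a minimal self-contained sketch of that contract; the file name iris.arff is an assumption, and any ARFF/CSV readable by Weka works:

    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    public class ClassIndexDemo {
        public static void main(String[] args) throws Exception {
            Instances data = DataSource.read("iris.arff"); // assumed path

            // classIndex() is -1 until a class attribute is chosen
            if (data.classIndex() == -1) {
                data.setClassIndex(data.numAttributes() - 1); // common convention: last attribute
            }

            System.out.println("class index: " + data.classIndex());
            System.out.println("class name:  " + data.classAttribute().name());
        }
    }

Most of the examples below reduce to three recurring idioms: guarding against classIndex() == -1, skipping the class column when looping over attributes, and copying the class index when a dataset header is rebuilt.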
From source file: com.tum.classifiertest.FastRfBagging.java
License: Open Source License
/**
 * Bagging method. Produces DataCache objects with bootstrap samples of
 * the original data, and feeds them to the base classifier (which can only
 * be a FastRandomTree).
 *
 * @param data         the training set to be used for generating the bagged classifier
 * @param numThreads   the number of simultaneous threads to use for computation;
 *                     pass zero (0) for autodetection
 * @param motherForest a reference to the FastRandomForest object that invoked this
 *
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data, int numThreads, FastRandomForest motherForest) throws Exception {

    // can classifier handle the vals?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    if (!(m_Classifier instanceof FastRandomTree))
        throw new IllegalArgumentException(
                "The FastRfBagging class accepts only FastRandomTree as its base classifier.");

    /* We fill the m_Classifiers array by creating lots of trees with new()
     * because this is much faster than using serialization to deep-copy the
     * one tree in m_Classifier - this is what the super.buildClassifier(data)
     * normally does. */
    m_Classifiers = new Classifier[m_NumIterations];
    for (int i = 0; i < m_Classifiers.length; i++) {
        FastRandomTree curTree = new FastRandomTree();
        // all parameters for training will be looked up in the motherForest (maxDepth, k_Value)
        curTree.m_MotherForest = motherForest;
        // 0.99: reference to these arrays will get passed down all nodes so the array can be re-used
        // 0.99: this array is of size two as now all splits are binary - even categorical ones
        curTree.tempProps = new double[2];
        curTree.tempDists = new double[2][];
        curTree.tempDists[0] = new double[data.numClasses()];
        curTree.tempDists[1] = new double[data.numClasses()];
        curTree.tempDistsOther = new double[2][];
        curTree.tempDistsOther[0] = new double[data.numClasses()];
        curTree.tempDistsOther[1] = new double[data.numClasses()];
        m_Classifiers[i] = curTree;
    }

    // this was SLOW.. takes approx 1/2 time as training the forest afterwards (!!!)
    // super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if out-of-bag error is to be calculated!");
    }

    // sorting is performed inside this constructor
    DataCache myData = new DataCache(data);

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    boolean[][] inBag = new boolean[m_Classifiers.length][];

    // thread management
    ExecutorService threadPool = Executors
            .newFixedThreadPool(numThreads > 0 ? numThreads : Runtime.getRuntime().availableProcessors());
    List<Future<?>> futures = new ArrayList<Future<?>>(m_Classifiers.length);

    try {
        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {
            // create the in-bag dataset (and be sure to remember what's in bag)
            // for computing the out-of-bag error later
            DataCache bagData = myData.resample(bagSize, random);
            bagData.reusableRandomGenerator = bagData.getRandomNumberGenerator(random.nextInt());
            inBag[treeIdx] = bagData.inBag; // store later for OOB error calculation

            // build the classifier
            if (m_Classifiers[treeIdx] instanceof FastRandomTree) {
                FastRandomTree aTree = (FastRandomTree) m_Classifiers[treeIdx];
                aTree.data = bagData;
                Future<?> future = threadPool.submit(aTree);
                futures.add(future);
            } else {
                throw new IllegalArgumentException(
                        "The FastRfBagging class accepts only FastRandomTree as its base classifier.");
            }
        }

        // make sure all trees have been trained before proceeding
        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {
            futures.get(treeIdx).get();
        }

        // calc OOB error?
        if (getCalcOutOfBag() || getComputeImportances()) {
            //m_OutOfBagError = computeOOBError(data, inBag, threadPool);
            m_OutOfBagError = computeOOBError(myData, inBag, threadPool);
        } else {
            m_OutOfBagError = 0;
        }

        // calc feature importances
        m_FeatureImportances = null;
        //m_FeatureNames = null;
        if (getComputeImportances()) {
            m_FeatureImportances = new double[data.numAttributes()];
            //m_FeatureNames = new String[data.numAttributes()];
            //Instances dataCopy = new Instances(data); //To scramble
            //int[] permutation = FastRfUtils.randomPermutation(data.numInstances(), random);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex()) {
                    //double sError = computeOOBError(FastRfUtils.scramble(data, dataCopy, j, permutation), inBag, threadPool);
                    //double sError = computeOOBError(data, inBag, threadPool, j, 0);
                    float[] unscrambled = myData.scrambleOneAttribute(j, random);
                    double sError = computeOOBError(myData, inBag, threadPool);
                    myData.vals[j] = unscrambled; // restore the original state
                    m_FeatureImportances[j] = sError - m_OutOfBagError;
                }
                //m_FeatureNames[j] = data.attribute(j).name();
            }
        }

        threadPool.shutdown();
    } finally {
        threadPool.shutdownNow();
    }
}
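The classIndex() usage worth isolating here is the permutation-importance loop, which must never scramble the class column itself. A hedged sketch of that idiom alone, independent of the FastRandomForest internals:

    // assumes "data" is a weka.core.Instances with its class attribute set
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j == data.classIndex()) {
            continue; // the class column is never permuted or scored
        }
        // ...scramble attribute j, re-measure out-of-bag error, restore attribute j...
    }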
From source file: com.tum.classifiertest.FastRfUtils.java
License: Open Source License
/**
 * Load a dataset into memory.
 *
 * @param location the location of the dataset
 *
 * @return the dataset
 */
public static Instances readInstances(String location) throws Exception {
    Instances data = new weka.core.converters.ConverterUtils.DataSource(location).getDataSet();
    if (data.classIndex() == -1)
        data.setClassIndex(data.numAttributes() - 1);
    return data;
}
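A hypothetical call site for this helper (the path is an assumption); because of the classIndex() == -1 guard, callers can rely on a class attribute being set:

    Instances train = FastRfUtils.readInstances("data/train.arff"); // assumed path
    // never -1 here: the helper defaults the class to the last attribute
    System.out.println("class index: " + train.classIndex());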
From source file: com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java
License: Apache License
/**
 * Samoa instances information.
 *
 * @param instances the instances
 * @return the instances
 */
public Instances samoaInstancesInformation(weka.core.Instances instances) {
    Instances samoaInstances;
    List<Attribute> attInfo = new ArrayList<Attribute>();
    for (int i = 0; i < instances.numAttributes(); i++) {
        attInfo.add(samoaAttribute(i, instances.attribute(i)));
    }
    samoaInstances = new Instances(instances.relationName(), attInfo, 0);
    samoaInstances.setClassIndex(instances.classIndex());
    return samoaInstances;
}
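The essential move above is transferring classIndex() from the source header to a freshly built one; a header constructed from scratch does not inherit the class designation. The same pattern with plain Weka types, as a hedged sketch:

    // assumes "source" is a weka.core.Instances with its class attribute set;
    // requires weka.core.Attribute, weka.core.Instances, java.util.ArrayList
    ArrayList<Attribute> attInfo = new ArrayList<Attribute>();
    for (int i = 0; i < source.numAttributes(); i++) {
        attInfo.add((Attribute) source.attribute(i).copy());
    }
    Instances header = new Instances(source.relationName(), attInfo, 0);
    header.setClassIndex(source.classIndex()); // without this, header.classIndex() == -1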
From source file: com.zazhu.BlueHub.BlueHub.java
License: Apache License
/**
 * Receives the last reads from the sensors and creates the features; we use
 * only the acc x,y,z (either from the internal or the external sensor).
 *
 * @param sensorQueue
 * @throws Exception
 */
private Instance processingSenseData(Queue<String> sensorQueue, char whatSensor) throws Exception {
    BufferedReader reader;
    Instances format;
    Instance newInstance = null;

    Log.d(TAG, "Queue size = " + mQueueSize);

    if (sensorQueue.size() <= 0)
        throw new Exception("Queue empty");

    // create the arrays that will contain the accelerometer data
    // s.x s.y s.z
    double[] sx = new double[sensorQueue.size()];
    double[] sy = new double[sensorQueue.size()];
    double[] sz = new double[sensorQueue.size()];

    String rawReading;
    StringTokenizer st;
    int index;

    if (D)
        Log.e(TAG, "+++ COMPUTING FEATURES +++");

    // 1. collect raw data. what kind of sensing data? external vs. internal
    switch (whatSensor) {
    case EXTERNAL:
        index = 0;
        while ((rawReading = sensorQueue.poll()) != null) {
            // FORMAT:
            // "Time_SensorName_SensorNumber_Counter_Xacc_Yacc_Zacc_Xgyro_Ygyro_checksum"
            // position of the values needed: s.x = 4, s.y = 5, s.z = 6
            st = new StringTokenizer(rawReading, FIELD_SEP);
            // not needed data
            for (int i = 0; i < 4; i++)
                st.nextToken();
            // s.x, s.y, s.z
            sx[index] = Double.valueOf(st.nextToken());
            sy[index] = Double.valueOf(st.nextToken());
            sz[index] = Double.valueOf(st.nextToken());
            index += 1;
        }

        // 2. process raw data
        // 2.1 read the input format for the instance (TODO must be changed to use weka classes)
        reader = new BufferedReader(new InputStreamReader(getResources().openRawResource(R.raw.format_extern)));
        try {
            format = new Instances(reader);
            if (format.classIndex() == -1)
                format.setClassIndex(format.numAttributes() - 1);
            // 2.2 create a new instance
            newInstance = new DenseInstance(7);
            newInstance.setDataset(format);
            // set attributes
            newInstance.setValue(format.attribute(0), Feature.getStd(sx));
            newInstance.setValue(format.attribute(1), Feature.getStd(sy));
            newInstance.setValue(format.attribute(2), Feature.getStd(sz));
            newInstance.setValue(format.attribute(3), Feature.getMean(sx));
            newInstance.setValue(format.attribute(4), Feature.getMean(sy));
            newInstance.setValue(format.attribute(5), Feature.getMean(sz));
            // set unknown class
            newInstance.setMissing(format.attribute(6));
        } catch (IOException e) {
            e.printStackTrace();
        }
        break;
    case INTERNAL:
        index = 0;
        while ((rawReading = sensorQueue.poll()) != null) {
            // FORMAT "Xacc_Yacc_Zacc"
            // position of the values needed: s.x = 0, s.y = 1, s.z = 2
            st = new StringTokenizer(rawReading, FIELD_SEP);
            // s.x, s.y, s.z
            sx[index] = Double.valueOf(st.nextToken());
            sy[index] = Double.valueOf(st.nextToken());
            sz[index] = Double.valueOf(st.nextToken());
            index += 1;
        }

        // 2. process raw data
        // 2.1 read the input format for the instance (TODO must be changed to use weka classes)
        reader = new BufferedReader(new InputStreamReader(getResources().openRawResource(R.raw.format_intern)));
        try {
            format = new Instances(reader);
            if (format.classIndex() == -1)
                format.setClassIndex(format.numAttributes() - 1);
            // 2.2 create a new instance
            newInstance = new DenseInstance(7);
            newInstance.setDataset(format);
            // set attributes
            newInstance.setValue(format.attribute(0), Feature.getStd(sx));
            newInstance.setValue(format.attribute(1), Feature.getStd(sy));
            newInstance.setValue(format.attribute(2), Feature.getStd(sz));
            newInstance.setValue(format.attribute(3), Feature.getMean(sx));
            newInstance.setValue(format.attribute(4), Feature.getMean(sy));
            newInstance.setValue(format.attribute(5), Feature.getMean(sz));
            // set unknown class
            newInstance.setMissing(format.attribute(6));
        } catch (IOException e) {
            e.printStackTrace();
        }
        break;
    default:
        if (D)
            Log.e(TAG, "+++ COMPUTING FEATURES: NO VALUE FOR THE SENSOR READING +++");
        break;
    }

    return newInstance;
}
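Stripped of the Android and sensor plumbing, the Weka-facing pattern in both branches is identical: load a header, default the class to the last attribute, then build an instance whose class value stays missing until classification. A condensed, hedged sketch (format.arff is an assumed stand-in for the raw resources):

    Instances format = new Instances(new BufferedReader(new FileReader("format.arff"))); // assumed file
    if (format.classIndex() == -1) {
        format.setClassIndex(format.numAttributes() - 1);
    }
    Instance inst = new DenseInstance(format.numAttributes());
    inst.setDataset(format);
    // feature values would be set here via inst.setValue(...)
    inst.setMissing(format.classIndex()); // class unknown until a classifier fills it in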
From source file: core.classifier.MyFirstClassifier.java
License: Open Source License
/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
           MUST be done since condition (8) of Keerthi's paper
           is made with the assertion Ci > 0 (See equation (3a). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing instances with weight 0!");
        }
        insts = data;
    }

    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build the binary classifiers
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}
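One classIndex() detail above is easy to miss: the numeric-attributes scan must exclude the class column, since a nominal class would otherwise force an unnecessary NominalToBinary pass. Isolated as a sketch:

    boolean onlyNumeric = true;
    for (int i = 0; i < insts.numAttributes(); i++) {
        if (i != insts.classIndex() && !insts.attribute(i).isNumeric()) {
            onlyNumeric = false; // a non-class nominal attribute triggers conversion
            break;
        }
    }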
From source file: core.ClusterEvaluationEX.java
License: Open Source License
/**
 * Evaluate the clusterer on a set of instances. Calculates clustering
 * statistics and stores cluster assignments for the instances in
 * m_clusterAssignments.
 *
 * @param test the set of instances to cluster
 * @param testFileName the name of the test file for incremental testing,
 *          if "" or null then not used
 * @param outputModel true if the clustering model is to be output as well
 *          as the stats
 *
 * @throws Exception if something goes wrong
 */
public void evaluateClusterer(Instances test, String testFileName, boolean outputModel) throws Exception {
    int i = 0;
    int cnum;
    double loglk = 0.0;
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    double[] instanceStats = new double[cc];
    Instances testRaw = null;
    boolean hasClass = (test.classIndex() >= 0);
    int unclusteredInstances = 0;
    Vector<Double> clusterAssignments = new Vector<Double>();
    Filter filter = null;
    DataSource source = null;
    Instance inst;

    if (testFileName == null)
        testFileName = "";

    // load data
    if (testFileName.length() != 0)
        source = new DataSource(testFileName);
    else
        source = new DataSource(test);
    testRaw = source.getStructure(test.classIndex());

    // If class is set then do class based evaluation as well
    if (hasClass) {
        if (testRaw.classAttribute().isNumeric())
            throw new Exception("ClusterEvaluation: Class must be nominal!");

        filter = new Remove();
        ((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1));
        ((Remove) filter).setInvertSelection(false);
        filter.setInputFormat(testRaw);
    }

    i = 0;
    while (source.hasMoreElements(testRaw)) {
        // next instance
        inst = source.nextElement(testRaw);
        if (filter != null) {
            filter.input(inst);
            filter.batchFinished();
            inst = filter.output();
        }

        cnum = -1;
        try {
            if (m_Clusterer instanceof DensityBasedClusterer) {
                loglk += ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(inst);
                cnum = m_Clusterer.clusterInstance(inst);
                clusterAssignments.add((double) cnum);
            } else {
                cnum = m_Clusterer.clusterInstance(inst);
                clusterAssignments.add((double) cnum);
            }
        } catch (Exception e) {
            clusterAssignments.add(-1.0);
            unclusteredInstances++;
        }

        if (cnum != -1) {
            instanceStats[cnum]++;
        }
    }

    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;

    m_clusterAssignments = new double[clusterAssignments.size()];
    for (i = 0; i < clusterAssignments.size(); i++) {
        m_clusterAssignments[i] = clusterAssignments.get(i);
    }
    int numInstFieldWidth = (int) ((Math.log(clusterAssignments.size()) / Math.log(10)) + 1);

    if (outputModel) {
        m_clusteringResults.append(m_Clusterer.toString());
    }
    m_clusteringResults.append("Clustered Instances\n\n");

    int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
    for (i = 0; i < cc; i++) {
        if (instanceStats[i] > 0)
            m_clusteringResults.append(Utils.doubleToString((double) i, clustFieldWidth, 0) + " "
                    + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " ("
                    + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0) + "%)\n");
    }

    if (unclusteredInstances > 0)
        m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);

    if (m_Clusterer instanceof DensityBasedClusterer)
        m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");

    if (hasClass) {
        evaluateClustersWithRespectToClass(test, testFileName);
    }
}
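Note the off-by-one trap handled above: the Remove filter takes 1-based attribute indices, while classIndex() is 0-based. A minimal sketch of dropping the class column before clustering, under that assumption:

    // requires weka.filters.Filter and weka.filters.unsupervised.attribute.Remove;
    // assumes "data" has its class attribute set
    Remove remove = new Remove();
    remove.setAttributeIndices("" + (data.classIndex() + 1)); // +1: filter indices are 1-based
    remove.setInvertSelection(false);
    remove.setInputFormat(data);
    Instances noClass = Filter.useFilter(data, remove); // safe to hand to a clusterer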
From source file: core.ClusterEvaluationEX.java
License: Open Source License
/**
 * Evaluates cluster assignments with respect to actual class labels.
 * Assumes that m_Clusterer has been trained and tested on
 * inst (minus the class).
 *
 * @param inst the instances (including class) to evaluate with respect to
 * @param fileName the name of the test file for incremental testing,
 *          if "" or null then not used
 * @throws Exception if something goes wrong
 */
private void evaluateClustersWithRespectToClass(Instances inst, String fileName) throws Exception {
    int numClasses = inst.classAttribute().numValues();
    int[][] counts = new int[m_numClusters][numClasses];
    int[] clusterTotals = new int[m_numClusters];
    double[] best = new double[m_numClusters + 1];
    double[] current = new double[m_numClusters + 1];

    DataSource source = null;
    Instances instances = null;
    Instance instance = null;
    int i;
    int numInstances;

    if (fileName == null)
        fileName = "";

    if (fileName.length() != 0) {
        source = new DataSource(fileName);
    } else
        source = new DataSource(inst);
    instances = source.getStructure(inst.classIndex());

    i = 0;
    while (source.hasMoreElements(instances)) {
        instance = source.nextElement(instances);
        if (m_clusterAssignments[i] >= 0) {
            counts[(int) m_clusterAssignments[i]][(int) instance.classValue()]++;
            clusterTotals[(int) m_clusterAssignments[i]]++;
        }
        i++;
    }
    numInstances = i;

    best[m_numClusters] = Double.MAX_VALUE;
    mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);

    m_clusteringResults.append("\n\nClass attribute: " + inst.classAttribute().name() + "\n");
    m_clusteringResults.append("Classes to Clusters:\n");
    String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
    m_clusteringResults.append(matrixString).append("\n");

    int Cwidth = 1 + (int) (Math.log(m_numClusters) / Math.log(10));
    // add the minimum error assignment
    for (i = 0; i < m_numClusters; i++) {
        if (clusterTotals[i] > 0) {
            m_clusteringResults.append("Cluster " + Utils.doubleToString((double) i, Cwidth, 0));
            m_clusteringResults.append(" <-- ");

            if (best[i] < 0) {
                m_clusteringResults.append("No class\n");
            } else {
                m_clusteringResults.append(inst.classAttribute().value((int) best[i])).append("\n");
            }
        }
    }

    m_clusteringResults.append("\nIncorrectly clustered instances :\t" + best[m_numClusters] + "\t"
            + (Utils.doubleToString((best[m_numClusters] / numInstances * 100.0), 8, 4)) + " %\n");

    // copy the class assignments
    m_classToCluster = new int[m_numClusters];
    for (i = 0; i < m_numClusters; i++) {
        m_classToCluster[i] = (int) best[i];
    }
}
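The counts matrix is indexed with (int) instance.classValue(), which works because classValue() returns the nominal class as a double-encoded position in the class attribute's value list. A tiny sketch of the same tally in isolation:

    // assumes "data" has a nominal class attribute set
    int[] classCounts = new int[data.numClasses()];
    for (int i = 0; i < data.numInstances(); i++) {
        classCounts[(int) data.instance(i).classValue()]++;
    }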
From source file: core.ClusterEvaluationEX.java
License: Open Source License
/**
 * Evaluates a clusterer with the options given in an array of
 * strings. It takes the string indicated by "-t" as training file, the
 * string indicated by "-T" as test file.
 * If the test file is missing, a stratified ten-fold
 * cross-validation is performed (distribution clusterers only).
 * Using "-x" you can change the number of
 * folds to be used, and using "-s" the random seed.
 * If the "-p" option is present it outputs the classification for
 * each test instance. If you provide the name of an object file using
 * "-l", a clusterer will be loaded from the given file. If you provide the
 * name of an object file using "-d", the clusterer built from the
 * training data will be saved to the given file.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @throws Exception if model could not be evaluated successfully
 * @return a string describing the results
 */
public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {
    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Random random;
    String trainFileName, testFileName, seedString, foldsString;
    String objectInputFileName, objectOutputFileName, attributeRangeString;
    String graphFileName;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering
    boolean updateable = (clusterer instanceof UpdateableClusterer);
    DataSource source = null;
    Instance inst;

    if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {
        // global info requested as well?
        boolean globalInfo = Utils.getFlag("synopsis", options) || Utils.getFlag("info", options);
        throw new Exception("Help requested." + makeOptionString(clusterer, globalInfo));
    }

    try {
        // Get basic options (options the same for all clusterers)
        //printClusterAssignments = Utils.getFlag('p', options);
        objectInputFileName = Utils.getOption('l', options);
        objectOutputFileName = Utils.getOption('d', options);
        trainFileName = Utils.getOption('t', options);
        testFileName = Utils.getOption('T', options);
        graphFileName = Utils.getOption('g', options);

        // Check -p option
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }
        if (attributeRangeString.length() != 0) {
            printClusterAssignments = true;
            if (!attributeRangeString.equals("0"))
                attributesToOutput = new Range(attributeRangeString);
        }

        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object input file given.");
            }
            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test file given.");
            }
        } else {
            if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                throw new Exception("Can't use both train and model file unless -p specified.");
            }
        }

        seedString = Utils.getOption('s', options);
        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }

        foldsString = Utils.getOption('x', options);
        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
            doXval = true;
        }
    } catch (Exception e) {
        throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer, false));
    }

    try {
        if (trainFileName.length() != 0) {
            source = new DataSource(trainFileName);
            train = source.getStructure();

            String classString = Utils.getOption('c', options);
            if (classString.length() != 0) {
                if (classString.compareTo("last") == 0)
                    theClass = train.numAttributes();
                else if (classString.compareTo("first") == 0)
                    theClass = 1;
                else
                    theClass = Integer.parseInt(classString);

                if (theClass != -1) {
                    if (doXval || testFileName.length() != 0)
                        throw new Exception("Can only do class based evaluation on the training data");

                    if (objectInputFileName.length() != 0)
                        throw new Exception("Can't load a clusterer and do class based evaluation");

                    if (objectOutputFileName.length() != 0)
                        throw new Exception("Can't do class based evaluation and save clusterer");
                }
            } else {
                // if the dataset defines a class attribute, use it
                if (train.classIndex() != -1) {
                    theClass = train.classIndex() + 1;
                    System.err.println("Note: using class attribute from dataset, i.e., attribute #" + theClass);
                }
            }

            if (theClass != -1) {
                if (theClass < 1 || theClass > train.numAttributes())
                    throw new Exception("Class is out of range!");

                if (!train.attribute(theClass - 1).isNominal())
                    throw new Exception("Class must be nominal!");

                train.setClassIndex(theClass - 1);
            }
        }
    } catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0)
        Utils.checkForRemainingOptions(options);

    // Set options for clusterer
    if (clusterer instanceof OptionHandler)
        ((OptionHandler) clusterer).setOptions(options);

    Utils.checkForRemainingOptions(options);

    Instances trainHeader = train;
    if (objectInputFileName.length() != 0) {
        // Load the clusterer from file
        // clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
        java.io.ObjectInputStream ois = new java.io.ObjectInputStream(
                new java.io.BufferedInputStream(new java.io.FileInputStream(objectInputFileName)));
        clusterer = (Clusterer) ois.readObject();
        // try and get the training header
        try {
            trainHeader = (Instances) ois.readObject();
        } catch (Exception ex) {
            // don't moan if we cant
        }
    } else {
        // Build the clusterer if no object file provided
        if (theClass == -1) {
            if (updateable) {
                clusterer.buildClusterer(source.getStructure());
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    ((UpdateableClusterer) clusterer).updateClusterer(inst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                clusterer.buildClusterer(source.getDataSet());
            }
        } else {
            Remove removeClass = new Remove();
            removeClass.setAttributeIndices("" + theClass);
            removeClass.setInvertSelection(false);
            removeClass.setInputFormat(train);
            if (updateable) {
                Instances clusterTrain = Filter.useFilter(train, removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    removeClass.input(inst);
                    removeClass.batchFinished();
                    Instance clusterTrainInst = removeClass.output();
                    ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
            }
            ClusterEvaluationEX ce = new ClusterEvaluationEX();
            ce.setClusterer(clusterer);
            ce.evaluateClusterer(train, trainFileName);

            return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
        }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data */
    if (printClusterAssignments) {
        return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append("\n\n=== Clustering stats for training data ===\n\n"
            + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
        // check header compatibility
        DataSource test = new DataSource(testFileName);
        Instances testStructure = test.getStructure();
        if (!trainHeader.equalHeaders(testStructure)) {
            throw new Exception("Training and testing data are not compatible\n");
        }

        text.append("\n\n=== Clustering stats for testing data ===\n\n"
                + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DensityBasedClusterer) && (doXval == true) && (testFileName.length() == 0)
            && (objectInputFileName.length() == 0)) {
        // cross validate the log likelihood on the training data
        random = new Random(seed);
        random.setSeed(seed);
        train = source.getDataSet();
        train.randomize(random);
        text.append(crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
        //SerializationHelper.write(objectOutputFileName, clusterer);
        saveClusterer(objectOutputFileName, clusterer, trainHeader);
    }

    // If classifier is drawable output string describing graph
    if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
        BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
        writer.write(((Drawable) clusterer).graph());
        writer.newLine();
        writer.flush();
        writer.close();
    }

    return text.toString();
}
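The -c handling above is another 1-based/0-based boundary: command-line class positions count from 1 ("first" is 1, "last" is numAttributes()), and only setClassIndex(theClass - 1) converts to the 0-based index that classIndex() later reports. Isolated as a sketch:

    int theClass;
    if (classString.equals("last"))
        theClass = train.numAttributes();        // 1-based position
    else if (classString.equals("first"))
        theClass = 1;
    else
        theClass = Integer.parseInt(classString);
    train.setClassIndex(theClass - 1);           // 0-based from here on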
From source file: core.TextDirectoryLoader.java
License: Open Source License
/**
 * Return the full data set. If the structure hasn't yet been determined by a
 * call to getStructure then this method should do so before processing the rest
 * of the data set.
 *
 * @return the structure of the data set as an empty set of Instances
 * @throws IOException if there is no source or parsing fails
 */
@Override
public Instances getDataSet() throws IOException {
    if (getDirectory() == null) {
        throw new IOException("No directory/source has been specified");
    }

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration<Object> enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements()) {
        Object oo = enm.nextElement();
        if (oo instanceof SerializedObject) {
            classes.add(((SerializedObject) oo).getObject().toString());
        } else {
            classes.add(oo.toString());
        }
    }

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (String file : files) {
            try {
                fileCount++;
                if (getDebug()) {
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + file);
                }

                double[] newInst = null;
                if (m_OutputFilename) {
                    newInst = new double[3];
                } else {
                    newInst = new double[2];
                }
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + file);
                BufferedReader is;
                if (m_charSet == null || m_charSet.length() == 0) {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
                } else {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
                }
                StringBuffer txtStr = new StringBuffer();
                /*int c;
                while ((c = is.read()) != -1) {
                    txtStr.append((char) c);
                }*/
                FileReader fr = new FileReader(txt);
                BufferedReader br = new BufferedReader(fr);
                String line;
                while ((line = br.readLine()) != null) {
                    txtStr.append(line + System.getProperty("line.separator"));
                }

                newInst[0] = data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename) {
                    newInst[1] = data.attribute(1).addStringValue(subdirPath + File.separator + file);
                }
                newInst[data.classIndex()] = k;
                data.add(new DenseInstance(1.0, newInst));
                is.close();
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + file);
            }
        }
    }

    return data;
}
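The key line for this page is newInst[data.classIndex()] = k: when a DenseInstance is created from a raw double[], the class label is written into the slot at classIndex() as the position of the nominal value. A hedged sketch of that idiom alone:

    // assumes "data" has a string attribute at index 0 and a nominal class attribute set;
    // k is the position of this document's label among the class attribute's values
    double[] vals = new double[data.numAttributes()];
    vals[0] = data.attribute(0).addStringValue("document text here"); // assumed content
    vals[data.classIndex()] = k;
    data.add(new DenseInstance(1.0, vals));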
From source file: core.TextDirectoryLoader.java
License: Open Source License
/**
 * Process input directories/files incrementally.
 *
 * @param structure ignored
 * @return never returns without throwing an exception
 * @throws IOException if a problem occurs
 */
@Override
public Instance getNextInstance(Instances structure) throws IOException {
    // throw new IOException("TextDirectoryLoader can't read data sets incrementally.");

    String directoryPath = getDirectory().getAbsolutePath();
    Attribute classAtt = structure.classAttribute();
    if (m_filesByClass == null) {
        m_filesByClass = new ArrayList<LinkedList<String>>();
        for (int i = 0; i < classAtt.numValues(); i++) {
            File classDir = new File(directoryPath + File.separator + classAtt.value(i));
            String[] files = classDir.list();
            LinkedList<String> classDocs = new LinkedList<String>();
            for (String cd : files) {
                File txt = new File(directoryPath + File.separator + classAtt.value(i) + File.separator + cd);
                if (txt.isFile()) {
                    classDocs.add(cd);
                }
            }
            m_filesByClass.add(classDocs);
        }
    }

    // cycle through the classes
    int count = 0;
    LinkedList<String> classContents = m_filesByClass.get(m_lastClassDir);
    boolean found = (classContents.size() > 0);
    while (classContents.size() == 0) {
        m_lastClassDir++;
        count++;
        if (m_lastClassDir == structure.classAttribute().numValues()) {
            m_lastClassDir = 0;
        }
        classContents = m_filesByClass.get(m_lastClassDir);
        if (classContents.size() > 0) {
            found = true; // we have an instance we can create
            break;
        }
        if (count == structure.classAttribute().numValues()) {
            break; // must be finished
        }
    }

    if (found) {
        String nextDoc = classContents.poll();
        File txt = new File(
                directoryPath + File.separator + classAtt.value(m_lastClassDir) + File.separator + nextDoc);

        BufferedReader is;
        if (m_charSet == null || m_charSet.length() == 0) {
            is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
        } else {
            is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
        }

        StringBuffer txtStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
            txtStr.append((char) c);
        }

        double[] newInst = null;
        if (m_OutputFilename) {
            newInst = new double[3];
        } else {
            newInst = new double[2];
        }

        newInst[0] = 0;
        structure.attribute(0).setStringValue(txtStr.toString());

        if (m_OutputFilename) {
            newInst[1] = 0;
            structure.attribute(1).setStringValue(txt.getAbsolutePath());
        }
        newInst[structure.classIndex()] = m_lastClassDir;
        Instance inst = new DenseInstance(1.0, newInst);
        inst.setDataset(structure);
        is.close();

        m_lastClassDir++;
        if (m_lastClassDir == structure.classAttribute().numValues()) {
            m_lastClassDir = 0;
        }

        return inst;
    } else {
        return null; // done!
    }
}
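A hypothetical consumption loop for this incremental loader; the setup is assumed (this custom core.TextDirectoryLoader appears to mirror Weka's directory-per-class layout), but the contract shown above, one Instance per call and null when all class directories are exhausted, is what the loop relies on:

    TextDirectoryLoader loader = new TextDirectoryLoader();  // the custom core.TextDirectoryLoader
    loader.setDirectory(new java.io.File("documents"));      // assumed layout: one subdirectory per class
    Instances structure = loader.getStructure();

    Instance inst;
    while ((inst = loader.getNextInstance(structure)) != null) {
        // classIndex() locates the label slot within each returned instance
        System.out.println(structure.classAttribute().value((int) inst.classValue()));
    }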