List of usage examples for weka.core Instances add
@Override public boolean add(Instance instance)
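Before the project examples below, a minimal sketch of the call itself — assuming the Weka >= 3.7 API, in which add has the boolean signature shown above (the examples below mostly build headers with the older FastVector class instead of ArrayList):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstancesAddSketch {
    public static void main(String[] args) {
        // Build a two-attribute header and an empty dataset (initial capacity 0).
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("x"));
        attributes.add(new Attribute("y"));
        Instances data = new Instances("Dataset", attributes, 0);

        // add(Instance) stores a copy of the instance and returns true.
        Instance inst = new DenseInstance(1.0, new double[] { 1.5, 2.5 });
        boolean added = data.add(inst);
        System.out.println(added + ", numInstances = " + data.numInstances());
    }
}

Because add stores a copy, later changes to inst do not affect the dataset; retrieve the stored copy with data.instance(0) if you need to modify it.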
From source file:guineu.modules.dataanalysis.clustering.ClusteringTask.java
License:Open Source License
/**
 * Creates the Weka data set for clustering of samples.
 *
 * @param rawData Data extracted from selected raw data files and rows.
 * @return Weka library data set
 */
private Instances createSampleWekaDataset(double[][] rawData) {
    FastVector attributes = new FastVector();
    for (int i = 0; i < rawData[0].length; i++) {
        String varName = "Var" + i;
        Attribute var = new Attribute(varName);
        attributes.addElement(var);
    }
    if (clusteringAlgorithm.toString().equals("Hierarchical Clusterer")) {
        Attribute name = new Attribute("name", (FastVector) null);
        attributes.addElement(name);
    }
    Instances data = new Instances("Dataset", attributes, 0);
    for (int i = 0; i < rawData.length; i++) {
        double[] values = new double[data.numAttributes()];
        System.arraycopy(rawData[i], 0, values, 0, rawData[0].length);
        if (clusteringAlgorithm.toString().equals("Hierarchical Clusterer")) {
            values[data.numAttributes() - 1] = data.attribute("name")
                    .addStringValue("\"" + this.selectedRawDataFiles[i] + "\"");
        }
        Instance inst = new SparseInstance(1.0, values);
        data.add(inst);
    }
    return data;
}
From source file:guineu.modules.dataanalysis.clustering.ClusteringTask.java
License:Open Source License
/**
 * Creates the Weka data set for clustering of variables (metabolites).
 *
 * @param rawData Data extracted from selected raw data files and rows.
 * @return Weka library data set
 */
private Instances createVariableWekaDataset(double[][] rawData) {
    FastVector attributes = new FastVector();
    for (int i = 0; i < this.selectedRawDataFiles.length; i++) {
        String varName = "Var" + i;
        Attribute var = new Attribute(varName);
        attributes.addElement(var);
    }
    if (clusteringAlgorithm.toString().equals("Hierarchical Clusterer")) {
        Attribute name = new Attribute("name", (FastVector) null);
        attributes.addElement(name);
    }
    Instances data = new Instances("Dataset", attributes, 0);
    for (int i = 0; i < selectedRows.length; i++) {
        double[] values = new double[data.numAttributes()];
        System.arraycopy(rawData[i], 0, values, 0, rawData[0].length);
        if (clusteringAlgorithm.toString().equals("Hierarchical Clusterer")) {
            String rowName = selectedRows[i].getName();
            values[data.numAttributes() - 1] = data.attribute("name").addStringValue(rowName);
        }
        Instance inst = new SparseInstance(1.0, values);
        data.add(inst);
    }
    return data;
}
From source file:guineu.util.WekaUtils.java
License:Open Source License
public static Instances getWekaDataset(Dataset dataset, boolean samples) {
    try {
        if (samples) {
            FastVector attributes = new FastVector();
            int cont = 1;
            for (PeakListRow row : dataset.getRows()) {
                String rowName = row.getName();
                if (rowName == null || rowName.isEmpty()) {
                    rowName = "Var";
                }
                rowName += cont++;
                Attribute var = new Attribute(rowName);
                attributes.addElement(var);
            }
            // Creates the dataset
            Instances data = new Instances(dataset.getDatasetName(), attributes, 0);
            for (int i = 0; i < dataset.getNumberCols(); i++) {
                String sampleName = dataset.getAllColumnNames().get(i);
                double[] values = new double[data.numAttributes()];
                cont = 0;
                for (PeakListRow row : dataset.getRows()) {
                    values[cont++] = (Double) row.getPeak(sampleName);
                }
                Instance inst = new SparseInstance(1.0, values);
                data.add(inst);
            }
            return data;
        } else {
            FastVector attributes = new FastVector();
            for (String column : dataset.getAllColumnNames()) {
                Attribute var = new Attribute(column);
                attributes.addElement(var);
            }
            // Creates the dataset
            Instances data = new Instances(dataset.getDatasetName(), attributes, 0);
            for (PeakListRow row : dataset.getRows()) {
                double[] values = new double[data.numAttributes()];
                int cont = 0;
                for (int i = 0; i < dataset.getNumberCols(); i++) {
                    String sampleName = dataset.getAllColumnNames().get(i);
                    values[cont++] = (Double) row.getPeak(sampleName);
                }
                Instance inst = new SparseInstance(1.0, values);
                data.add(inst);
            }
            return data;
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        return null;
    }
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Select only instances with weights that contribute to
 * the specified quantile of the weight distribution.
 *
 * @param data the input instances
 * @param quantile the specified quantile, e.g. 0.9 to select
 *        90% of the weight mass
 * @return the selected instances
 */
protected Instances selectWeightQuantile(Instances data, double quantile) {
    int numInstances = data.numInstances();
    Instances trainData = new Instances(data, numInstances);
    double[] weights = new double[numInstances];
    double sumOfWeights = 0;
    for (int i = 0; i < numInstances; i++) {
        weights[i] = data.instance(i).weight();
        sumOfWeights += weights[i];
    }
    double weightMassToSelect = sumOfWeights * quantile;
    int[] sortedIndices = Utils.sort(weights);

    // Select the instances
    sumOfWeights = 0;
    for (int i = numInstances - 1; i >= 0; i--) {
        Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
        trainData.add(instance);
        sumOfWeights += weights[sortedIndices[i]];
        if ((sumOfWeights > weightMassToSelect) && (i > 0)
                && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
            break;
        }
    }
    if (m_Debug) {
        System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
    }
    return trainData;
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Builds a new data set by randomly drawing nMaj majority-class and
 * nMin minority-class examples (with replacement) from the input.
 *
 * @param copia the input instances
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param nMaj number of majority-class examples to draw
 * @param nMin number of minority-class examples to draw
 * @param simplingRandom random number generator
 * @return the sampled data set
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int nMaj, int nMin,
        Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // First, save the indexes of the majority- and minority-class examples.
    String majClassName = copia.attribute(copia.classIndex()).value(majC);
    Instances myDataset = new Instances(copia, 0);
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }
    if (minCount <= 0)
        return copia;

    /* random undersampling of the majority */
    int r;
    if (nMaj == majCount) {
        for (int i = 0; i < nMaj; i++) {
            myDataset.add(copia.instance(majExamples[i]));
        }
    } else {
        for (int i = 0; i < nMaj; i++) {
            r = simplingRandom.nextInt(majCount);
            myDataset.add(copia.instance(majExamples[r]));
        }
    }
    for (int i = 0; i < nMin; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);
    return myDataset;
}
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Creates a new dataset of the same size using random sampling
 * with replacement according to the given weight vector. The
 * weights of the instances in the new dataset are set to one.
 * The length of the weight vector has to be the same as the
 * number of instances in the dataset, and all weights have to
 * be positive.
 *
 * @param data the data to be sampled from
 * @param random a random number generator
 * @param sampled indicating which instance has been sampled
 * @return the new dataset
 * @throws IllegalArgumentException if the weights array is of the wrong
 *         length or contains negative weights.
 */
public final Instances resampleWithWeights(Instances data, Random random, boolean[] sampled) {
    double[] weights = new double[data.numInstances()];
    for (int i = 0; i < weights.length; i++) {
        weights[i] = data.instance(i).weight();
    }
    Instances newData = new Instances(data, data.numInstances());
    if (data.numInstances() == 0) {
        return newData;
    }
    double[] probabilities = new double[data.numInstances()];
    double sumProbs = 0, sumOfWeights = Utils.sum(weights);
    for (int i = 0; i < data.numInstances(); i++) {
        sumProbs += random.nextDouble();
        probabilities[i] = sumProbs;
    }
    Utils.normalize(probabilities, sumProbs / sumOfWeights);

    // Make sure that rounding errors don't mess things up
    probabilities[data.numInstances() - 1] = sumOfWeights;
    int k = 0;
    int l = 0;
    sumProbs = 0;
    while ((k < data.numInstances()) && (l < data.numInstances())) {
        if (weights[l] < 0) {
            throw new IllegalArgumentException("Weights have to be positive.");
        }
        sumProbs += weights[l];
        while ((k < data.numInstances()) && (probabilities[k] <= sumProbs)) {
            newData.add(data.instance(l));
            sampled[l] = true;
            newData.instance(k).setWeight(1);
            k++;
        }
        l++;
    }
    return newData;
}
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Keeps 100% of the majority class, resamples the minority class at
 * rate (Nmaj/Nmin)*a%, and then balances the result with SMOTE (k, a).
 *
 * @param copia the input instances
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param a resampling rate in percent
 * @param simplingRandom random number generator
 * @return the sampled data set
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // Resample the minority class at rate (Nmaj/Nmin)*a%
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random undersampling of the majority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    // copy 100% of the majority class
    for (int i = 0; i < majCount; i++) {
        myDataset.add(copia.instance(majExamples[i]));
    }
    if (minCount == 0)
        return myDataset;
    // draw (Nmaj/Nmin)*a% of the minority class
    for (int i = 0; i < size; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);

    if (size == 1) {
        try {
            // a single minority example leaves SMOTE no nearest neighbors;
            // fall back to supervised resampling with a uniform class bias
            Resample filter = new Resample();
            filter.setInputFormat(myDataset);
            filter.setBiasToUniformClass(1.0);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (size > 1) {
        try {
            SMOTE filter = new SMOTE();
            filter.setInputFormat(myDataset); // filter capabilities are checked here
            // SMOTE percentage needed to bring the minority count up to majCount
            double value = 100.0 * majCount / size - 100;
            filter.setPercentage(value);
            //if (nMin<5) filter.setNearestNeighbors(nMin);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return myDataset;
}
From source file:gyc.UnderOverBoostM1.java
License:Open Source License
/**
 * Builds a balanced sample: draws a% of the majority-class count from
 * each class (with replacement), for a total of twice that size.
 *
 * @param copia the input instances
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param a sampling rate in percent
 * @param simplingRandom random number generator
 * @return the sampled data set
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // First, save the indexes of the majority- and minority-class examples.
    // The new data set contains a% of the majority count from each class.
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random sampling of both classes */
    Instances myDataset = new Instances(copia, 0);
    int r;
    for (int i = 0; i < size / 2; i++) {
        r = simplingRandom.nextInt(majCount);
        myDataset.add(copia.instance(majExamples[r]));
        if (minCount > 0) {
            r = simplingRandom.nextInt(minCount);
            myDataset.add(copia.instance(minExamples[r]));
        }
    }
    myDataset.randomize(simplingRandom);
    return myDataset;
}
From source file:hsa_jni.hsa_jni.EvaluatePeriodicHeldOutTestBatch.java
License:Open Source License
@Override
protected Object doMainTask(TaskMonitor monitor, ObjectRepository repository) {
    Classifier learner = (Classifier) getPreparedClassOption(this.learnerOption);
    InstanceStream stream = (InstanceStream) getPreparedClassOption(this.streamOption);
    ClassificationPerformanceEvaluator evaluator =
            (ClassificationPerformanceEvaluator) getPreparedClassOption(this.evaluatorOption);
    learner.setModelContext(stream.getHeader());
    long instancesProcessed = 0;
    LearningCurve learningCurve = new LearningCurve("evaluation instances");
    File dumpFile = this.dumpFileOption.getFile();
    PrintStream immediateResultStream = null;
    if (dumpFile != null) {
        try {
            if (dumpFile.exists()) {
                immediateResultStream = new PrintStream(new FileOutputStream(dumpFile, true), true);
            } else {
                immediateResultStream = new PrintStream(new FileOutputStream(dumpFile), true);
            }
        } catch (Exception ex) {
            throw new RuntimeException("Unable to open immediate result file: " + dumpFile, ex);
        }
    }
    boolean firstDump = true;
    InstanceStream testStream = null;
    int testSize = this.testSizeOption.getValue();
    if (this.cacheTestOption.isSet()) {
        monitor.setCurrentActivity("Caching test examples...", -1.0);
        Instances testInstances = new Instances(stream.getHeader(), this.testSizeOption.getValue());
        while (testInstances.numInstances() < testSize) {
            testInstances.add(stream.nextInstance());
            if (testInstances.numInstances() % INSTANCES_BETWEEN_MONITOR_UPDATES == 0) {
                if (monitor.taskShouldAbort()) {
                    return null;
                }
                monitor.setCurrentActivityFractionComplete(
                        (double) testInstances.numInstances() / (double) (this.testSizeOption.getValue()));
            }
        }
        testStream = new CachedInstancesStream(testInstances);
    } else {
        testStream = stream;
    }
    instancesProcessed = 0;
    TimingUtils.enablePreciseTiming();
    double totalTrainTime = 0.0;
    while ((this.trainSizeOption.getValue() < 1 || instancesProcessed < this.trainSizeOption.getValue())
            && stream.hasMoreInstances()) {
        monitor.setCurrentActivityDescription("Training...");
        long instancesTarget = instancesProcessed + this.sampleFrequencyOption.getValue();
        ArrayList<Instance> instanceCache = new ArrayList<Instance>();
        long trainStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread();
        double lastTrainTime = 0;
        while (instancesProcessed < instancesTarget && stream.hasMoreInstances()) {
            instanceCache.add(stream.nextInstance());
            instancesProcessed++;
            if (instancesProcessed % INSTANCES_BETWEEN_MONITOR_UPDATES == 0) {
                if (monitor.taskShouldAbort()) {
                    return null;
                }
                monitor.setCurrentActivityFractionComplete(
                        (double) (instancesProcessed) / (double) (this.trainSizeOption.getValue()));
            }
            if (instanceCache.size() % 1000 == 0) {
                trainStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread();
                for (Instance inst : instanceCache) {
                    learner.trainOnInstance(inst);
                }
                lastTrainTime += TimingUtils
                        .nanoTimeToSeconds(TimingUtils.getNanoCPUTimeOfCurrentThread() - trainStartTime);
                instanceCache.clear();
            }
        }
        trainStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread();
        for (Instance inst : instanceCache) {
            learner.trainOnInstance(inst);
        }
        if (learner instanceof BatchClassifier) {
            ((BatchClassifier) learner).commit();
        }
        lastTrainTime += TimingUtils
                .nanoTimeToSeconds(TimingUtils.getNanoCPUTimeOfCurrentThread() - trainStartTime);
        totalTrainTime += lastTrainTime;
        if (totalTrainTime > this.trainTimeOption.getValue()) {
            break;
        }
        if (this.cacheTestOption.isSet()) {
            testStream.restart();
        }
        evaluator.reset();
        long testInstancesProcessed = 0;
        monitor.setCurrentActivityDescription("Testing (after "
                + StringUtils.doubleToString(
                        ((double) (instancesProcessed) / (double) (this.trainSizeOption.getValue()) * 100.0), 2)
                + "% training)...");
        long testStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread();
        int instCount = 0;
        for (instCount = 0; instCount < testSize; instCount++) {
            if (!stream.hasMoreInstances()) {
                break;
            }
            Instance testInst = (Instance) testStream.nextInstance().copy();
            double trueClass = testInst.classValue();
            testInst.setClassMissing();
            double[] prediction = learner.getVotesForInstance(testInst);
            testInst.setClassValue(trueClass);
            evaluator.addResult(testInst, prediction);
            testInstancesProcessed++;
            if (testInstancesProcessed % INSTANCES_BETWEEN_MONITOR_UPDATES == 0) {
                if (monitor.taskShouldAbort()) {
                    return null;
                }
                monitor.setCurrentActivityFractionComplete(
                        (double) testInstancesProcessed / (double) (testSize));
            }
        }
        if (instCount != testSize) {
            break;
        }
        double testTime = TimingUtils
                .nanoTimeToSeconds(TimingUtils.getNanoCPUTimeOfCurrentThread() - testStartTime);
        List<Measurement> measurements = new ArrayList<Measurement>();
        measurements.add(new Measurement("evaluation instances", instancesProcessed));
        measurements.add(new Measurement("total train time", totalTrainTime));
        measurements.add(new Measurement("total train speed", instancesProcessed / totalTrainTime));
        measurements.add(new Measurement("last train time", lastTrainTime));
        measurements.add(
                new Measurement("last train speed", this.sampleFrequencyOption.getValue() / lastTrainTime));
        measurements.add(new Measurement("test time", testTime));
        measurements.add(new Measurement("test speed", this.testSizeOption.getValue() / testTime));
        Measurement[] performanceMeasurements = evaluator.getPerformanceMeasurements();
        for (Measurement measurement : performanceMeasurements) {
            measurements.add(measurement);
        }
        Measurement[] modelMeasurements = learner.getModelMeasurements();
        for (Measurement measurement : modelMeasurements) {
            measurements.add(measurement);
        }
        learningCurve.insertEntry(
                new LearningEvaluation(measurements.toArray(new Measurement[measurements.size()])));
        if (immediateResultStream != null) {
            if (firstDump) {
                immediateResultStream.println(learningCurve.headerToString());
                firstDump = false;
            }
            immediateResultStream.println(learningCurve.entryToString(learningCurve.numEntries() - 1));
            immediateResultStream.flush();
        }
        if (monitor.resultPreviewRequested()) {
            monitor.setLatestResultPreview(learningCurve.copy());
        }
    }
    if (immediateResultStream != null) {
        immediateResultStream.close();
    }
    return learningCurve;
}
From source file:imba.classifier.NBTubes.java
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    // This function computes the class-membership probabilities for the
    // instance passed as the parameter.
    Instances temp = null;
    Instance p;
    Filter f;
    double[] a = new double[infoClassifier.get(0).get(0).size()];
    int i, j, k, l, x, c;
    double t, prev;
    Enumeration n;
    boolean big;
    String val;
    String[] valMinMax;

    if (wasNumeric) {
        header_Instances.add(instance);
        f = new Normalize();
        try {
            f.setInputFormat(header_Instances);
            for (Instance i1 : header_Instances) {
                f.input(i1);
            }
            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }
        temp = f.getOutputFormat();
        while ((p = f.output()) != null) {
            temp.add(p);
        }
    }

    f = new NumericToNominal();
    if (wasNumeric) {
        try {
            f.setInputFormat(temp);
            for (Instance i1 : temp) {
                f.input(i1);
            }
            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }
        temp = f.getOutputFormat();
        p = null;
        while ((p = f.output()) != null) {
            temp.add(p);
        }
        instance = temp.lastInstance();
        header_Instances.remove(header_Instances.size() - 1);
    } else {
        f.setInputFormat(header_Instances);
        f.input(instance);
        f.batchFinished();
        instance = f.output();
    }

    // Compute the instance's distribution over each class
    i = 0;
    while (i < a.length) {
        a[i] = (double) sumClass[i] / dataSize;
        j = 0;
        k = 0;
        while (j < infoClassifier.size()) {
            if (j == classIdx) {
                k++;
            }
            if (wasNumeric) {
                if (filter.equals("Discretize")) {
                    l = 0;
                    big = false;
                    while (l < dataset.attribute(k).numValues() && big == false) {
                        // parse the interval label of the discretized attribute
                        val = String.valueOf(dataset.attribute(k).value(l));
                        val = val.replaceAll("'", "");
                        val = val.replaceAll("\\(", "");
                        val = val.replaceAll("\\)", "");
                        val = val.replaceAll("]", "");
                        valMinMax = val.split("-");
                        // match the instance value against the interval bounds
                        if (valMinMax.length == 3) {
                            if (valMinMax[1].equals("inf")) {
                                valMinMax[1] = "0.0";
                            }
                            if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[1])
                                    && Double.valueOf(instance.stringValue(k)) <= Double.valueOf(valMinMax[2])) {
                                big = true;
                            }
                        } else {
                            if (valMinMax.length == 2) {
                                if (valMinMax[1].equals("inf")) {
                                    valMinMax[1] = "1.0";
                                }
                                if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[0])
                                        && Double.valueOf(instance.stringValue(k)) <= Double
                                                .valueOf(valMinMax[1])) {
                                    big = true;
                                }
                            } else {
                                l = dataset.attribute(k).indexOfValue(instance.stringValue(k));
                                big = true;
                            }
                        }
                        l++;
                    }
                    x = l - 1;
                } else {
                    big = false;
                    l = 0;
                    n = dataset.attribute(k).enumerateValues();
                    t = 0;
                    prev = 0;
                    while (l < dataset.attribute(k).numValues() && big == false) {
                        t = Double.valueOf(n.nextElement().toString());
                        if (Double.valueOf(instance.stringValue(k)) <= t) {
                            big = true;
                        } else {
                            prev = t;
                        }
                        l++;
                    }
                    if (big == true && t != Double.valueOf(instance.stringValue(k))) {
                        System.out.println(prev + " " + Double.valueOf(instance.stringValue(k)) + " " + t);
                    }
                    if (classIdx < 2) {
                        c = 2;
                    } else {
                        c = 1;
                    }
                    if (big == true && l > c) {
                        // snap to the nearer of the two neighboring values
                        if ((Double.valueOf(instance.stringValue(k)) - prev)
                                <= (t - Double.valueOf(instance.stringValue(k)))) {
                            x = l - 2;
                        } else {
                            x = l - 1;
                        }
                    } else {
                        x = l - 1;
                    }
                }
            } else {
                x = dataset.attribute(k).indexOfValue(instance.stringValue(k));
            }
            a[i] *= infoClassifier.get(j).get(x).get(i);
            k++;
            j++;
        }
        i++;
    }
    return a;
}