List of usage examples for `weka.core.Instances.size()`
@Override public int size()
From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java
License:Apache License
/**
 * Applies the LACE2 data selection: iterates over the (shuffled) candidate training sets,
 * reduces each with CLIFF, and adds instances to the selection using a leader-follower
 * scheme — an instance is kept if it has no unlike neighbor in the selection yet, or if its
 * distance to the nearest unlike neighbor exceeds a threshold (the median unlike-neighbor
 * distance of a sample of the first training set), after privatizing it with MORPH.
 *
 * @param testdata
 *            test data; only used as a template for attribute structure of the selection
 * @param traindataSet
 *            candidate training data sets; NOTE(review): the selected data appears to be
 *            accumulated in selectedData but is never written back to traindataSet within
 *            this snippet — confirm against the full class
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // empty copy of the test data: inherits the attribute structure only
    Instances selectedData = new Instances(testdata);
    selectedData.clear();
    // shuffle a copy so the leader-follower pass is order-independent across runs
    LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet);
    Collections.shuffle(traindataCopy);
    CLIFF cliff = new CLIFF();
    cliff.setParameter(Double.toString(percentage));
    MORPH morph = new MORPH();
    Median median = new Median();
    // sentinel: MIN_VALUE means "distance threshold not yet determined"
    double minDist = Double.MIN_VALUE;
    for (Instances traindata : traindataCopy) {
        Instances cliffedData = cliff.applyCLIFF(traindata);
        if (minDist == Double.MIN_VALUE) {
            // determine distance for leader-follower algorithm from the first training set;
            // large sets are downsampled to roughly 100 instances first
            Instances sample;
            if (traindata.size() > 100) {
                Resample resample = new Resample();
                resample.setSampleSizePercent(100.0 / traindata.size() * 100.0);
                resample.setBiasToUniformClass(0.0);
                resample.setNoReplacement(true);
                try {
                    resample.setInputFormat(traindata);
                    sample = Filter.useFilter(traindata, resample);
                }
                catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
            else {
                sample = new Instances(traindata);
            }
            // threshold = median distance of each sampled instance to its nearest
            // differently-labeled neighbor
            double[] distances = new double[sample.size()];
            for (int i = 0; i < sample.size(); i++) {
                Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample);
                distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)),
                                                   WekaUtils.instanceValues(unlikeNeighbor));
            }
            minDist = median.evaluate(distances);
        }
        for (int i = 0; i < cliffedData.size(); i++) {
            Instance unlikeNeighbor =
                morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData);
            if (unlikeNeighbor == null) {
                // no unlike neighbor in the current selection: accept unconditionally
                selectedData.add(cliffedData.get(i));
            }
            else {
                double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)),
                                                      WekaUtils.instanceValues(unlikeNeighbor));
                if (distance > minDist) {
                    // far enough from the decision boundary: privatize with MORPH, then accept
                    morph.morphInstance(cliffedData.get(i), cliffedData);
                    selectedData.add(cliffedData.get(i));
                }
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java
License:Apache License
/** * <p>//from w w w . j a v a2 s. co m * removes all instances, whose Mahalanobi distance to the mean of the data is greater than * epsilon. * </p> * * @param data * data where the outliers are removed */ private void applyMahalanobisDistancesRemoval(Instances data) { RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1); for (int i = 0; i < data.size(); i++) { values.setRow(i, WekaUtils.instanceValues(data.get(i))); } RealMatrix inverseCovariance; try { inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver() .getInverse(); } catch (SingularMatrixException e) { Console.traceln(Level.WARNING, "could not perform Mahalanobis outlier removal due to singular covariance matrix"); return; } // create mean vector double[] meanValues = new double[data.numAttributes() - 1]; int k = 0; for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { meanValues[k] = data.attributeStats(j).numericStats.mean; k++; } } for (int i = data.size() - 1; i >= 0; i--) { double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)), meanValues); if (distance > epsilon) { data.remove(i); } } }
From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License:Apache License
/**
 * <p>
 * Applies the relevancy filter after Ryu et al.: for each test instance, the training
 * instances at minimal Hamming distance are selected; the union of these nearest neighbors
 * forms the filtered training data.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    // TreeSet deduplicates indices selected for multiple test instances
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        // cache the distances: the original computed every Hamming distance twice
        // (once to find the minimum, once to select); one pass over the cache suffices
        double[] distances = new double[traindata.size()];
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            distances[j] = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distances[j] < minHam) {
                minHam = distances[j];
            }
        }
        for (int j = 0; j < traindata.size(); j++) {
            if (distances[j] <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    // empty copy of the test data: inherits the attribute structure only
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java
License:Apache License
/**
 * <p>
 * Applies the synonym outlier removal: an instance is removed if, for every non-class
 * attribute, its minimal distance to any other instance is strictly greater than the global
 * minimal pairwise distance for that attribute, i.e., it is nowhere anyone's closest value.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    // minDistance[i][k]: minimal distance of instance i to any other instance on the
    // k-th non-class attribute; minDistanceAttribute[k]: global minimum over all pairs
    double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1];
    double minDistanceAttribute[] = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    // BUG FIX: the original loop started at traindata.size() - 1 and therefore populated
    // minDistance only for the last instance; all other rows stayed 0.0, so no instance
    // was ever classified as an outlier. The loop must cover all instances.
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    // iterate backwards so deletion does not shift the indices still to be visited
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        // BUG FIX: the original bound was traindata.numAttributes(), which overruns the
        // (numAttributes() - 1)-wide rows of minDistance once hasClosest stays false
        for (int j = 0; !hasClosest && j < traindata.numAttributes() - 1; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}
From source file:de.ugoe.cs.cpdp.execution.WithinProjectOrderedSplitExperiment.java
License:Apache License
/**
 * Executes the experiment: loads and filters the software versions, then for every test
 * version splits its data chronologically into a training part and a test part (split ratio
 * taken from the execution strategy parameter, default 0.5), applies pre-processors,
 * pointwise selectors, post-processors and trainers, and finally runs the configured
 * evaluators.
 *
 * @see Runnable#run()
 */
@Override
public void run() {
    // load all versions and drop those rejected by the version filters
    final List<SoftwareVersion> versions = new LinkedList<>();
    for (IVersionLoader loader : config.getLoaders()) {
        versions.addAll(loader.load());
    }
    for (IVersionFilter filter : config.getVersionFilters()) {
        filter.apply(versions);
    }
    boolean writeHeader = true;
    int versionCount = 1;
    int testVersionCount = 0;
    int numTrainers = 0;
    // count test versions up front for the "[current/total]" log prefix
    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            testVersionCount++;
        }
    }
    // total trainer count, used to decide whether all results are already available
    numTrainers += config.getSetWiseTrainers().size();
    numTrainers += config.getSetWiseTestdataAwareTrainers().size();
    numTrainers += config.getTrainers().size();
    numTrainers += config.getTestAwareTrainers().size();
    // sort versions
    Collections.sort(versions);
    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            Console.traceln(Level.INFO,
                            String.format("[%s] [%02d/%02d] %s: starting",
                                          config.getExperimentName(), versionCount,
                                          testVersionCount, testVersion.getVersion()));
            // skip versions for which all trainer/repetition results already exist
            int numResultsAvailable = resultsAvailable(testVersion);
            if (numResultsAvailable >= numTrainers * config.getRepetitions()) {
                Console.traceln(Level.INFO,
                                String.format("[%s] [%02d/%02d] %s: results already available; skipped",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion()));
                versionCount++;
                continue;
            }

            // Setup testdata and training data
            Instances testdata = testVersion.getInstances();
            List<Double> efforts = testVersion.getEfforts();

            // now split data into parts
            double percentage = 0.5; // 0.5 as default value
            String param = config.getExecutionStrategyParameters();
            if (config.getExecutionStrategyParameters() != null) {
                try {
                    percentage = Double.parseDouble(param);
                }
                catch (NumberFormatException e) {
                    throw new RuntimeException("invalid execution strategy parameter, must be numeric: "
                        + param);
                }
            }
            int initialTestSize = testdata.size();
            // traindata starts as a full copy; the split then deletes complementary halves:
            // the first `percentage` fraction (older data) stays in traindata, the rest in
            // testdata. Backwards iteration keeps indices valid while deleting.
            Instances traindata = new Instances(testdata);
            for (int i = initialTestSize - 1; i >= 0; i--) {
                if ((((double) i) / initialTestSize) < percentage) {
                    testdata.delete(i);
                    if (efforts != null) {
                        efforts.remove(i);
                    }
                }
                else {
                    traindata.delete(i);
                }
            }
            for (IProcessesingStrategy processor : config.getPreProcessors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying preprocessor %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              dataselector.getClass().getName()));
                traindata = dataselector.apply(testdata, traindata);
            }
            for (IProcessesingStrategy processor : config.getPostProcessors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (ITrainingStrategy trainer : config.getTrainers()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying trainer %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              trainer.getName()));
                trainer.apply(traindata);
            }
            for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying trainer %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              trainer.getName()));
                trainer.apply(testdata, traindata);
            }
            // make sure the results directory exists before evaluators write into it
            File resultsDir = new File(config.getResultsPath());
            if (!resultsDir.exists()) {
                resultsDir.mkdir();
            }
            for (IEvaluationStrategy evaluator : config.getEvaluators()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying evaluator %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              evaluator.getClass().getName()));
                List<ITrainer> allTrainers = new LinkedList<>();
                for (ITrainingStrategy trainer : config.getTrainers()) {
                    allTrainers.add(trainer);
                }
                for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                    allTrainers.add(trainer);
                }
                // the CSV target is only set on the very first evaluation, together with
                // writing the header row
                if (writeHeader) {
                    evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName()
                        + ".csv");
                }
                evaluator.apply(testdata, traindata, allTrainers, efforts, writeHeader,
                                config.getResultStorages());
                writeHeader = false;
            }
            Console.traceln(Level.INFO,
                            String.format("[%s] [%02d/%02d] %s: finished",
                                          config.getExperimentName(), versionCount,
                                          testVersionCount, testVersion.getVersion()));
            versionCount++;
        }
    }
}
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads a Netgene data set: reads the file metrics CSV passed in, plus the sibling
 * "&lt;project&gt;_bugs_per_file.csv" and "&lt;project&gt;_network_metrics.csv" files from the
 * same directory, merges the network metrics into the file metrics by filename, derives a
 * binary "bug" class attribute from the bug counts, and cleans up the result.
 *
 * @param fileMetricsFile
 *            the file metrics CSV; its name is expected to start with "&lt;project&gt;_"
 * @return the merged instances with the class attribute last, or null if reading failed
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;
    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values): replace each nominal network attribute
        // with a numeric one, parsing the string values and defaulting to 0.0
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        }
                        catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    }
                    else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // fix string attributes: same conversion as above for string-typed attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        }
                        catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    }
                    else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // index the metrics instances by filename (attribute 0) for the merges below
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data; network filenames are in attribute 1
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                                                                 networkData.instance(i).value(j));
                }
            }
        }
        // add bug information as a binary nominal class attribute ("0"/"1");
        // any file with a positive bug count (attribute 2) is labeled "1"
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }
        // remove filenames
        metricsData.deleteAttributeAt(0);
        // drop the "eigenvector" attribute if present
        // NOTE(review): deleteAttributeAt inside a forward loop shifts the remaining
        // indices; this works here only if a single attribute matches — verify
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }
        metricsData.setClassIndex(metricsData.numAttributes() - 1);
        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    }
    catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}
From source file:de.ugoe.cs.cpdp.training.MetricMatchingTraining.java
License:Apache License
/**
 * We need the test data instances to do a metric matching, so in this special case we get this
 * data before evaluation. Selects, from the candidate training sets, the one whose attributes
 * best match the test data (highest matching score with at least one matched attribute) and
 * builds a {@code MetricMatchingClassifier} on it; if no set matches, falls back to a
 * {@code FixClass} classifier.
 *
 * @param traindataSet
 *            candidate training data sets
 * @param testdata
 *            test data used as the matching target
 */
@Override
public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) {
    // reset these for each run
    this.mm = null;
    this.classifier = null;

    double score = 0; // matching score to select the best matching training data from the set
    int num = 0;          // 1-based index of the candidate currently examined
    int biggest_num = 0;  // index of the best match so far, for logging only
    MetricMatch tmp;
    for (Instances traindata : traindataSet) {
        num++;

        tmp = new MetricMatch(traindata, testdata);

        // metric selection may create error, continue to next training set
        // NOTE(review): the comment says "continue", but the catch rethrows and aborts — verify intent
        try {
            tmp.attributeSelection();
            tmp.matchAttributes(this.method, this.threshold);
        }
        catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }

        // we only select the training data from our set with the most matching attributes
        if (tmp.getScore() > score && tmp.attributes.size() > 0) {
            score = tmp.getScore();
            this.mm = tmp;
            biggest_num = num;
        }
    }

    // if we have found a matching instance we use it, log information about the match for
    // additional eval later
    Instances ilist = null;
    if (this.mm != null) {
        ilist = this.mm.getMatchedTrain();
        Console.traceln(Level.INFO,
                        "[MATCH FOUND] match: [" + biggest_num + "], score: [" + score
                            + "], instances: [" + ilist.size() + "], attributes: ["
                            + this.mm.attributes.size() + "], ilist attrs: ["
                            + ilist.numAttributes() + "]");
        for (Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) {
            Console.traceln(Level.INFO,
                            "[MATCHED ATTRIBUTE] source attribute: ["
                                + this.mm.train.attribute(attmatch.getKey()).name()
                                + "], target attribute: ["
                                + this.mm.test.attribute(attmatch.getValue()).name() + "]");
        }
    }
    else {
        Console.traceln(Level.INFO, "[NO MATCH FOUND]");
    }

    // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass
    // Classifier
    try {
        if (this.mm != null) {
            this.classifier = new MetricMatchingClassifier();
            this.classifier.buildClassifier(ilist);
            ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm);
        }
        else {
            this.classifier = new FixClass();
            this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier
                                                    // does not use it anyway
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
 * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
 * </p>
 *
 * @param traindata
 *            data from which the attribute is upscaled.
 * @param attributeIndex
 *            index of the attribute
 * @return data with upscaled attribute
 */
public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
    // deep copy so the caller's data is not modified
    Instances scaled = new Instances(traindata);
    for (int row = 0; row < traindata.size(); row++) {
        double upscaled = traindata.get(row).value(attributeIndex) * SCALER;
        scaled.get(row).setValue(attributeIndex, upscaled);
    }
    return scaled;
}
From source file:de.unidue.langtech.grading.tc.LearningCurveTask.java
License:Open Source License
/**
 * Runs a learning-curve evaluation: for each configured training-set size and each iteration,
 * loads the train/test ARFF files, randomly shuffles the training data, truncates it to the
 * requested size, trains the configured classifier, and serializes the evaluation result to
 * a size- and iteration-specific output file.
 *
 * @param aContext
 *            task context providing the input/output storage locations
 * @throws Exception
 *             if data loading, training, or writing the results fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    boolean multiLabel = false;

    for (Integer numberInstances : NUMBER_OF_TRAINING_INSTANCES) {
        for (int iteration = 0; iteration < ITERATIONS; iteration++) {
            File arffFileTrain = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY)
                            .getPath() + "/" + TRAINING_DATA_FILENAME);
            File arffFileTest = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY).getPath()
                            + "/" + TRAINING_DATA_FILENAME);

            Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);
            Instances testData = TaskUtils.getInstances(arffFileTest, multiLabel);

            // skip curve points that ask for more training data than is available
            if (numberInstances > trainData.size()) {
                continue;
            }

            Classifier cl = AbstractClassifier.forName(classificationArguments.get(0),
                    classificationArguments.subList(1, classificationArguments.size()).toArray(new String[0]));

            // keep an untouched copy of the test data; the outcome IDs removed below are
            // restored from it after prediction
            Instances copyTestData = new Instances(testData);
            trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
            testData = WekaUtils.removeOutcomeId(testData, multiLabel);

            // fresh nanoTime seed per iteration so each iteration draws a different subset
            Random generator = new Random();
            generator.setSeed(System.nanoTime());
            trainData.randomize(generator);

            // remove fraction of training data that should not be used for training
            for (int i = trainData.size() - 1; i >= numberInstances; i--) {
                trainData.delete(i);
            }

            // file to hold prediction results
            File evalOutput = new File(
                    aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE).getPath() + "/"
                            + EVALUATION_DATA_FILENAME + "_" + numberInstances + "_" + iteration);

            // train the classifier on the train set split - not necessary in multilabel setup, but
            // in single label setup
            cl.buildClassifier(trainData);

            weka.core.SerializationHelper.write(evalOutput.getAbsolutePath(),
                    WekaUtils.getEvaluationSinglelabel(cl, trainData, testData));
            testData = WekaUtils.getPredictionInstancesSingleLabel(testData, cl);
            testData = WekaUtils.addOutcomeId(testData, copyTestData, false);

            // // Write out the predictions
            // DataSink.write(aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE)
            // .getAbsolutePath() + "/" + PREDICTIONS_FILENAME + "_" + trainPercent, testData);
        }
    }
}
From source file:de.upb.timok.utils.DatasetTransformationUtils.java
License:Open Source License
/**
 * Converts a Weka {@link Instances} object into a list of plain double arrays, one array per
 * instance.
 *
 * @param instances
 *            the instances to convert
 * @param chopClassAttribute
 *            if true, the last array element is dropped for every instance
 *            (assumes the class attribute is the last one — TODO confirm with callers)
 * @return one double array per instance, in instance order
 */
public static List<double[]> instancesToDoubles(Instances instances, boolean chopClassAttribute) {
    final List<double[]> rows = new ArrayList<>();
    for (int index = 0; index < instances.size(); index++) {
        double[] values = instances.get(index).toDoubleArray();
        if (chopClassAttribute) {
            // drop the trailing (class) value
            values = Arrays.copyOfRange(values, 0, values.length - 1);
        }
        rows.add(values);
    }
    return rows;
}