List of usage examples for `weka.core.Instances.size()`
@Override public int size()
From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java
License:Apache License
/**
 * Applies the LACE2 data selection: iterates over the (shuffled) candidate training sets,
 * reduces each with CLIFF, and adds instances to the selection using a leader-follower
 * scheme — an instance is kept if it has no unlike neighbor in the selection yet, or if its
 * distance to the nearest unlike neighbor exceeds a threshold (the median unlike-neighbor
 * distance of a sample of the first training set), after privatizing it with MORPH.
 *
 * @param testdata
 *            test data; only used as a template for attribute structure of the selection
 * @param traindataSet
 *            candidate training data sets; NOTE(review): the selected data appears to be
 *            accumulated in selectedData but is never written back to traindataSet within
 *            this snippet — confirm against the full class
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // empty copy of the test data: inherits the attribute structure only
    Instances selectedData = new Instances(testdata);
    selectedData.clear();
    // shuffle a copy so the leader-follower pass is order-independent across runs
    LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet);
    Collections.shuffle(traindataCopy);
    CLIFF cliff = new CLIFF();
    cliff.setParameter(Double.toString(percentage));
    MORPH morph = new MORPH();
    Median median = new Median();
    // sentinel: MIN_VALUE means "distance threshold not yet determined"
    double minDist = Double.MIN_VALUE;
    for (Instances traindata : traindataCopy) {
        Instances cliffedData = cliff.applyCLIFF(traindata);
        if (minDist == Double.MIN_VALUE) {
            // determine distance for leader-follower algorithm from the first training set;
            // large sets are downsampled to roughly 100 instances first
            Instances sample;
            if (traindata.size() > 100) {
                Resample resample = new Resample();
                resample.setSampleSizePercent(100.0 / traindata.size() * 100.0);
                resample.setBiasToUniformClass(0.0);
                resample.setNoReplacement(true);
                try {
                    resample.setInputFormat(traindata);
                    sample = Filter.useFilter(traindata, resample);
                }
                catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
            else {
                sample = new Instances(traindata);
            }
            // threshold = median distance of each sampled instance to its nearest
            // differently-labeled neighbor
            double[] distances = new double[sample.size()];
            for (int i = 0; i < sample.size(); i++) {
                Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample);
                distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)),
                                                   WekaUtils.instanceValues(unlikeNeighbor));
            }
            minDist = median.evaluate(distances);
        }
        for (int i = 0; i < cliffedData.size(); i++) {
            Instance unlikeNeighbor =
                morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData);
            if (unlikeNeighbor == null) {
                // no unlike neighbor in the current selection: accept unconditionally
                selectedData.add(cliffedData.get(i));
            }
            else {
                double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)),
                                                      WekaUtils.instanceValues(unlikeNeighbor));
                if (distance > minDist) {
                    // far enough from the decision boundary: privatize with MORPH, then accept
                    morph.morphInstance(cliffedData.get(i), cliffedData);
                    selectedData.add(cliffedData.get(i));
                }
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java
License:Apache License
/** * <p>//from w w w . j a v a2 s. co m * removes all instances, whose Mahalanobi distance to the mean of the data is greater than * epsilon. * </p> * * @param data * data where the outliers are removed */ private void applyMahalanobisDistancesRemoval(Instances data) { RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1); for (int i = 0; i < data.size(); i++) { values.setRow(i, WekaUtils.instanceValues(data.get(i))); } RealMatrix inverseCovariance; try { inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver() .getInverse(); } catch (SingularMatrixException e) { Console.traceln(Level.WARNING, "could not perform Mahalanobis outlier removal due to singular covariance matrix"); return; } // create mean vector double[] meanValues = new double[data.numAttributes() - 1]; int k = 0; for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { meanValues[k] = data.attributeStats(j).numericStats.mean; k++; } } for (int i = data.size() - 1; i >= 0; i--) { double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)), meanValues); if (distance > epsilon) { data.remove(i); } } }
From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License:Apache License
/**
 * <p>
 * Applies the relevancy filter after Ryu et al.: for each test instance, the training
 * instances at minimal Hamming distance are selected; the union of these nearest neighbors
 * forms the filtered training data.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    // TreeSet deduplicates indices selected for multiple test instances
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        // cache the distances: the original computed every Hamming distance twice
        // (once to find the minimum, once to select); one pass over the cache suffices
        double[] distances = new double[traindata.size()];
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            distances[j] = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distances[j] < minHam) {
                minHam = distances[j];
            }
        }
        for (int j = 0; j < traindata.size(); j++) {
            if (distances[j] <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    // empty copy of the test data: inherits the attribute structure only
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java
License:Apache License
/**
 * <p>
 * Applies the synonym outlier removal: an instance is removed if, for every non-class
 * attribute, its minimal distance to any other instance is strictly greater than the global
 * minimal pairwise distance for that attribute, i.e., it is nowhere anyone's closest value.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    // minDistance[i][k]: minimal distance of instance i to any other instance on the
    // k-th non-class attribute; minDistanceAttribute[k]: global minimum over all pairs
    double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1];
    double minDistanceAttribute[] = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    // BUG FIX: the original loop started at traindata.size() - 1 and therefore populated
    // minDistance only for the last instance; all other rows stayed 0.0, so no instance
    // was ever classified as an outlier. The loop must cover all instances.
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    // iterate backwards so deletion does not shift the indices still to be visited
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        // BUG FIX: the original bound was traindata.numAttributes(), which overruns the
        // (numAttributes() - 1)-wide rows of minDistance once hasClosest stays false
        for (int j = 0; !hasClosest && j < traindata.numAttributes() - 1; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}
From source file:de.ugoe.cs.cpdp.execution.WithinProjectOrderedSplitExperiment.java
License:Apache License
/**
 * Executes the experiment: loads and filters the software versions, then for every test
 * version splits its data chronologically into a training part and a test part (split ratio
 * taken from the execution strategy parameter, default 0.5), applies pre-processors,
 * pointwise selectors, post-processors and trainers, and finally runs the configured
 * evaluators.
 *
 * @see Runnable#run()
 */
@Override
public void run() {
    // load all versions and drop those rejected by the version filters
    final List<SoftwareVersion> versions = new LinkedList<>();
    for (IVersionLoader loader : config.getLoaders()) {
        versions.addAll(loader.load());
    }
    for (IVersionFilter filter : config.getVersionFilters()) {
        filter.apply(versions);
    }
    boolean writeHeader = true;
    int versionCount = 1;
    int testVersionCount = 0;
    int numTrainers = 0;
    // count test versions up front for the "[current/total]" log prefix
    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            testVersionCount++;
        }
    }
    // total trainer count, used to decide whether all results are already available
    numTrainers += config.getSetWiseTrainers().size();
    numTrainers += config.getSetWiseTestdataAwareTrainers().size();
    numTrainers += config.getTrainers().size();
    numTrainers += config.getTestAwareTrainers().size();
    // sort versions
    Collections.sort(versions);
    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            Console.traceln(Level.INFO,
                            String.format("[%s] [%02d/%02d] %s: starting",
                                          config.getExperimentName(), versionCount,
                                          testVersionCount, testVersion.getVersion()));
            // skip versions for which all trainer/repetition results already exist
            int numResultsAvailable = resultsAvailable(testVersion);
            if (numResultsAvailable >= numTrainers * config.getRepetitions()) {
                Console.traceln(Level.INFO,
                                String.format("[%s] [%02d/%02d] %s: results already available; skipped",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion()));
                versionCount++;
                continue;
            }

            // Setup testdata and training data
            Instances testdata = testVersion.getInstances();
            List<Double> efforts = testVersion.getEfforts();

            // now split data into parts
            double percentage = 0.5; // 0.5 as default value
            String param = config.getExecutionStrategyParameters();
            if (config.getExecutionStrategyParameters() != null) {
                try {
                    percentage = Double.parseDouble(param);
                }
                catch (NumberFormatException e) {
                    throw new RuntimeException("invalid execution strategy parameter, must be numeric: "
                        + param);
                }
            }
            int initialTestSize = testdata.size();
            // traindata starts as a full copy; the split then deletes complementary halves:
            // the first `percentage` fraction (older data) stays in traindata, the rest in
            // testdata. Backwards iteration keeps indices valid while deleting.
            Instances traindata = new Instances(testdata);
            for (int i = initialTestSize - 1; i >= 0; i--) {
                if ((((double) i) / initialTestSize) < percentage) {
                    testdata.delete(i);
                    if (efforts != null) {
                        efforts.remove(i);
                    }
                }
                else {
                    traindata.delete(i);
                }
            }
            for (IProcessesingStrategy processor : config.getPreProcessors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying preprocessor %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              dataselector.getClass().getName()));
                traindata = dataselector.apply(testdata, traindata);
            }
            for (IProcessesingStrategy processor : config.getPostProcessors()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (ITrainingStrategy trainer : config.getTrainers()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying trainer %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              trainer.getName()));
                trainer.apply(traindata);
            }
            for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying trainer %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              trainer.getName()));
                trainer.apply(testdata, traindata);
            }
            // make sure the results directory exists before evaluators write into it
            File resultsDir = new File(config.getResultsPath());
            if (!resultsDir.exists()) {
                resultsDir.mkdir();
            }
            for (IEvaluationStrategy evaluator : config.getEvaluators()) {
                Console.traceln(Level.FINE,
                                String.format("[%s] [%02d/%02d] %s: applying evaluator %s",
                                              config.getExperimentName(), versionCount,
                                              testVersionCount, testVersion.getVersion(),
                                              evaluator.getClass().getName()));
                List<ITrainer> allTrainers = new LinkedList<>();
                for (ITrainingStrategy trainer : config.getTrainers()) {
                    allTrainers.add(trainer);
                }
                for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                    allTrainers.add(trainer);
                }
                // the CSV target is only set on the very first evaluation, together with
                // writing the header row
                if (writeHeader) {
                    evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName()
                        + ".csv");
                }
                evaluator.apply(testdata, traindata, allTrainers, efforts, writeHeader,
                                config.getResultStorages());
                writeHeader = false;
            }
            Console.traceln(Level.INFO,
                            String.format("[%s] [%02d/%02d] %s: finished",
                                          config.getExperimentName(), versionCount,
                                          testVersionCount, testVersion.getVersion()));
            versionCount++;
        }
    }
}
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads a Netgene data set: reads the file metrics CSV passed in, plus the sibling
 * "&lt;project&gt;_bugs_per_file.csv" and "&lt;project&gt;_network_metrics.csv" files from the
 * same directory, merges the network metrics into the file metrics by filename, derives a
 * binary "bug" class attribute from the bug counts, and cleans up the result.
 *
 * @param fileMetricsFile
 *            the file metrics CSV; its name is expected to start with "&lt;project&gt;_"
 * @return the merged instances with the class attribute last, or null if reading failed
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;
    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values): replace each nominal network attribute
        // with a numeric one, parsing the string values and defaulting to 0.0
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        }
                        catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    }
                    else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // fix string attributes: same conversion as above for string-typed attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        }
                        catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    }
                    else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // index the metrics instances by filename (attribute 0) for the merges below
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data; network filenames are in attribute 1
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                                                                 networkData.instance(i).value(j));
                }
            }
        }
        // add bug information as a binary nominal class attribute ("0"/"1");
        // any file with a positive bug count (attribute 2) is labeled "1"
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }
        // remove filenames
        metricsData.deleteAttributeAt(0);
        // drop the "eigenvector" attribute if present
        // NOTE(review): deleteAttributeAt inside a forward loop shifts the remaining
        // indices; this works here only if a single attribute matches — verify
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }
        metricsData.setClassIndex(metricsData.numAttributes() - 1);
        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    }
    catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}
From source file:de.ugoe.cs.cpdp.training.MetricMatchingTraining.java
License:Apache License
/**
 * We need the test data instances to do a metric matching, so in this special case we get this
 * data before evaluation. Selects, from the candidate training sets, the one whose attributes
 * best match the test data (highest matching score with at least one matched attribute) and
 * builds a {@code MetricMatchingClassifier} on it; if no set matches, falls back to a
 * {@code FixClass} classifier.
 *
 * @param traindataSet
 *            candidate training data sets
 * @param testdata
 *            test data used as the matching target
 */
@Override
public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) {
    // reset these for each run
    this.mm = null;
    this.classifier = null;

    double score = 0; // matching score to select the best matching training data from the set
    int num = 0;          // 1-based index of the candidate currently examined
    int biggest_num = 0;  // index of the best match so far, for logging only
    MetricMatch tmp;
    for (Instances traindata : traindataSet) {
        num++;

        tmp = new MetricMatch(traindata, testdata);

        // metric selection may create error, continue to next training set
        // NOTE(review): the comment says "continue", but the catch rethrows and aborts — verify intent
        try {
            tmp.attributeSelection();
            tmp.matchAttributes(this.method, this.threshold);
        }
        catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }

        // we only select the training data from our set with the most matching attributes
        if (tmp.getScore() > score && tmp.attributes.size() > 0) {
            score = tmp.getScore();
            this.mm = tmp;
            biggest_num = num;
        }
    }

    // if we have found a matching instance we use it, log information about the match for
    // additional eval later
    Instances ilist = null;
    if (this.mm != null) {
        ilist = this.mm.getMatchedTrain();
        Console.traceln(Level.INFO,
                        "[MATCH FOUND] match: [" + biggest_num + "], score: [" + score
                            + "], instances: [" + ilist.size() + "], attributes: ["
                            + this.mm.attributes.size() + "], ilist attrs: ["
                            + ilist.numAttributes() + "]");
        for (Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) {
            Console.traceln(Level.INFO,
                            "[MATCHED ATTRIBUTE] source attribute: ["
                                + this.mm.train.attribute(attmatch.getKey()).name()
                                + "], target attribute: ["
                                + this.mm.test.attribute(attmatch.getValue()).name() + "]");
        }
    }
    else {
        Console.traceln(Level.INFO, "[NO MATCH FOUND]");
    }

    // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass
    // Classifier
    try {
        if (this.mm != null) {
            this.classifier = new MetricMatchingClassifier();
            this.classifier.buildClassifier(ilist);
            ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm);
        }
        else {
            this.classifier = new FixClass();
            this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier
                                                    // does not use it anyway
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
 * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
 * </p>
 *
 * @param traindata
 *            data from which the attribute is upscaled.
 * @param attributeIndex
 *            index of the attribute
 * @return data with upscaled attribute
 */
public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
    // deep copy so the caller's data is not modified
    Instances scaled = new Instances(traindata);
    for (int row = 0; row < traindata.size(); row++) {
        double upscaled = traindata.get(row).value(attributeIndex) * SCALER;
        scaled.get(row).setValue(attributeIndex, upscaled);
    }
    return scaled;
}
From source file:de.unidue.langtech.grading.tc.LearningCurveTask.java
License:Open Source License
/**
 * Runs a learning-curve evaluation: for each configured training-set size and each iteration,
 * loads the train/test ARFF files, randomly shuffles the training data, truncates it to the
 * requested size, trains the configured classifier, and serializes the evaluation result to
 * a size- and iteration-specific output file.
 *
 * @param aContext
 *            task context providing the input/output storage locations
 * @throws Exception
 *             if data loading, training, or writing the results fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    boolean multiLabel = false;

    for (Integer numberInstances : NUMBER_OF_TRAINING_INSTANCES) {
        for (int iteration = 0; iteration < ITERATIONS; iteration++) {
            File arffFileTrain = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY)
                            .getPath() + "/" + TRAINING_DATA_FILENAME);
            File arffFileTest = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY).getPath()
                            + "/" + TRAINING_DATA_FILENAME);

            Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);
            Instances testData = TaskUtils.getInstances(arffFileTest, multiLabel);

            // skip curve points that ask for more training data than is available
            if (numberInstances > trainData.size()) {
                continue;
            }

            Classifier cl = AbstractClassifier.forName(classificationArguments.get(0),
                    classificationArguments.subList(1, classificationArguments.size()).toArray(new String[0]));

            // keep an untouched copy of the test data; the outcome IDs removed below are
            // restored from it after prediction
            Instances copyTestData = new Instances(testData);
            trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
            testData = WekaUtils.removeOutcomeId(testData, multiLabel);

            // fresh nanoTime seed per iteration so each iteration draws a different subset
            Random generator = new Random();
            generator.setSeed(System.nanoTime());
            trainData.randomize(generator);

            // remove fraction of training data that should not be used for training
            for (int i = trainData.size() - 1; i >= numberInstances; i--) {
                trainData.delete(i);
            }

            // file to hold prediction results
            File evalOutput = new File(
                    aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE).getPath() + "/"
                            + EVALUATION_DATA_FILENAME + "_" + numberInstances + "_" + iteration);

            // train the classifier on the train set split - not necessary in multilabel setup, but
            // in single label setup
            cl.buildClassifier(trainData);

            weka.core.SerializationHelper.write(evalOutput.getAbsolutePath(),
                    WekaUtils.getEvaluationSinglelabel(cl, trainData, testData));
            testData = WekaUtils.getPredictionInstancesSingleLabel(testData, cl);
            testData = WekaUtils.addOutcomeId(testData, copyTestData, false);

            // // Write out the predictions
            // DataSink.write(aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE)
            // .getAbsolutePath() + "/" + PREDICTIONS_FILENAME + "_" + trainPercent, testData);
        }
    }
}
From source file:de.upb.timok.utils.DatasetTransformationUtils.java
License:Open Source License
/**
 * Converts a Weka {@link Instances} object into a list of plain double arrays, one array per
 * instance.
 *
 * @param instances
 *            the instances to convert
 * @param chopClassAttribute
 *            if true, the last array element is dropped for every instance
 *            (assumes the class attribute is the last one — TODO confirm with callers)
 * @return one double array per instance, in instance order
 */
public static List<double[]> instancesToDoubles(Instances instances, boolean chopClassAttribute) {
    final List<double[]> rows = new ArrayList<>();
    for (int index = 0; index < instances.size(); index++) {
        double[] values = instances.get(index).toDoubleArray();
        if (chopClassAttribute) {
            // drop the trailing (class) value
            values = Arrays.copyOfRange(values, 0, values.length - 1);
        }
        rows.add(values);
    }
    return rows;
}