Example usage for weka.core Instances classIndex

Introduction

In this page you can find the example usage for weka.core Instances classIndex.

Prototype


publicint classIndex()

Source Link

Document

Returns the class attribute's index.

Usage

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>//from w w w.j a  va2 s. c o m
 * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;

            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>//from  w  w w.  j a v a2s.c  o m
 * Z-Score normalization (N2 in Transfer Defect Learning by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void zScore(Instances data) {
    final double[] mean = new double[data.numAttributes()];
    final double[] std = new double[data.numAttributes()];

    // get means and stddevs of data
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            mean[j] = data.meanOrMode(j);
            std[j] = Math.sqrt(data.variance(j));
        }
    }
    applyZScore(data, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*from   ww  w  . j  a va2s.  c  o m*/
 * Z-Score normalization using the mean and std of the training data (N3 in Transfer Defect
 * Learning by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTraining(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of training
    for (int j = 0; j < traindata.numAttributes(); j++) {
        if (traindata.classIndex() != j) {
            mean[j] = traindata.meanOrMode(j);
            std[j] = Math.sqrt(traindata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*  w  w w.ja  v  a  2 s .  c  om*/
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTarget(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>//ww  w  .  j a  v a2  s  . com
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindataSet
 *            training data
 */
public static void zScoreTarget(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    for (Instances traindata : traindataSet) {
        applyZScore(traindata, mean, std);
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>//  ww  w . ja va  2s  .co  m
 * Internal helper function
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                instance.setValue(j, instance.value(j) - mean[j] / std[j]);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java

License:Apache License

@Override
public void apply(Instances testdata, Instances traindata) {

    final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
    if (counts[1] < counts[0]) {
        Instances negatives = new Instances(traindata);
        Instances positives = new Instances(traindata);

        for (int i = traindata.size() - 1; i >= 0; i--) {
            if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
                negatives.remove(i);/*from w w w . j a va2s .  c  o  m*/
            }
            if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
                positives.remove(i);
            }
        }

        Resample resample = new Resample();
        resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
        try {
            resample.setInputFormat(traindata);
            positives = Filter.useFilter(positives, resample);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        traindata.clear();
        for (int i = 0; i < negatives.size(); i++) {
            traindata.add(negatives.get(i));
        }
        for (int i = 0; i < positives.size(); i++) {
            traindata.add(positives.get(i));
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java

License:Apache License

/**
 * <p>//from   w  w w .  jav  a2s  .  c o  m
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java

License:Apache License

private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }/*w  w  w .  j a va  2  s  . c  o m*/
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                if (secondAttempt) {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindataCopy);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                } else {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindata);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                }
            } catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                } else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        } while (!selectionSuccessful); // dummy loop for internal continue
    }

    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }
    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j<30 ensures that the computational time does not explode since the powerset is 2^n in
    // complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }
    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java

License:Apache License

/**
 * <p>//from  www  .  j  ava 2 s  . c  o  m
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in
                                                                                         // the
                                                                                         // paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the
                                                                                 // paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST, "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
            .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
            .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");
    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
            eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}