Example usage for weka.core Instances numAttributes

Introduction

In this page you can find the example usage for weka.core Instances numAttributes.

Prototype


publicint numAttributes()

Source Link

Document

Returns the number of attributes.

Usage

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>//from   www.  j ava 2 s .co m
 * Z-Score normalization using the mean and std of the training data (N3 in Transfer Defect
 * Learning by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTraining(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of training
    for (int j = 0; j < traindata.numAttributes(); j++) {
        if (traindata.classIndex() != j) {
            mean[j] = traindata.meanOrMode(j);
            std[j] = Math.sqrt(traindata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*  ww  w.  jav  a  2s  . c  o  m*/
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTarget(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*from  w  ww  . java2 s  .  c o m*/
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindataSet
 *            training data
 */
public static void zScoreTarget(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    for (Instances traindata : traindataSet) {
        applyZScore(traindata, mean, std);
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*from  ww w .  j  a  va 2  s .  co  m*/
 * Internal helper function
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                instance.setValue(j, instance.value(j) - mean[j] / std[j]);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java

License:Apache License

/**
 * <p>//from   www  .  j  av  a2  s  .  co m
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java

License:Apache License

private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }/* ww  w . j a va 2  s  . c o  m*/
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                if (secondAttempt) {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindataCopy);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                } else {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindata);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                }
            } catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                } else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        } while (!selectionSuccessful); // dummy loop for internal continue
    }

    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }
    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j<30 ensures that the computational time does not explode since the powerset is 2^n in
    // complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }
    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java

License:Apache License

/**
 * <p>//from ww w.j  a va2s .com
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in
                                                                                         // the
                                                                                         // paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the
                                                                                 // paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST, "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
            .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
            .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");
    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
            eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java

License:Apache License

/**
 * <p>//from   w  w  w . jav  a  2 s.c  om
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}

From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java

License:Apache License

/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)/*from  w  w w.j  ava  2 s  .  c o m*/
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();
    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }
    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if cluster has any training data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }

        }
    }

    return filteredTraindata;
}

From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java

License:Apache License

@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }/*from w w w . j  a  v a2s .c o m*/
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0;
                        } else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0;
                        } else {
                            similarity[k] = 0.0;
                        }
                    }

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }
        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                } else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                } else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }

        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    } catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}