Example usage for weka.core Instances deleteAttributeAt


Introduction

On this page you can find example usage of weka.core Instances deleteAttributeAt.

Prototype



public void deleteAttributeAt(int position) 

Document

Deletes an attribute at the given position (0 to numAttributes() - 1).
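
A minimal, self-contained sketch of the call (assuming the Weka 3.7+ API; the dataset and attribute names are made up):

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class DeleteAttributeAtSketch {
    public static void main(String[] args) {
        // build a tiny dataset with three numeric attributes
        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(new Attribute("width"));
        attributes.add(new Attribute("height"));
        attributes.add(new Attribute("label"));
        Instances data = new Instances("demo", attributes, 0);
        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0, 0.0 }));

        // delete the attribute at position 1 ("height");
        // all attributes after it shift down by one index
        data.deleteAttributeAt(1);
        System.out.println(data.numAttributes()); // prints 2
    }
}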

Usage

From source file: com.edwardraff.WekaMNIST.java

License: Open Source License

public static void main(String[] args) throws IOException, Exception {
    String folder = args[0];
    String trainPath = folder + "MNISTtrain.arff";
    String testPath = folder + "MNISTtest.arff";

    System.out.println("Weka Timings");
    Instances mnistTrainWeka = new Instances(new BufferedReader(new FileReader(new File(trainPath))));
    mnistTrainWeka.setClassIndex(mnistTrainWeka.numAttributes() - 1);
    Instances mnistTestWeka = new Instances(new BufferedReader(new FileReader(new File(testPath))));
    mnistTestWeka.setClassIndex(mnistTestWeka.numAttributes() - 1);

    // normalize attribute ranges into [0, 1]
    Normalize normalizeFilter = new Normalize();
    normalizeFilter.setInputFormat(mnistTrainWeka);

    mnistTestWeka = Normalize.useFilter(mnistTestWeka, normalizeFilter);
    mnistTrainWeka = Normalize.useFilter(mnistTrainWeka, normalizeFilter);

    long start, end;

    System.out.println("RBF SVM (Full Cache)");
    SMO smo = new SMO();
    smo.setKernel(new RBFKernel(mnistTrainWeka, 0/*0 causes Weka to cache the whole matrix...*/, 0.015625));
    smo.setC(8.0);
    smo.setBuildLogisticModels(false);
    evalModel(smo, mnistTrainWeka, mnistTestWeka);

    System.out.println("RBF SVM (No Cache)");
    smo = new SMO();
    smo.setKernel(new RBFKernel(mnistTrainWeka, 1, 0.015625));
    smo.setC(8.0);
    smo.setBuildLogisticModels(false);
    evalModel(smo, mnistTrainWeka, mnistTestWeka);

    System.out.println("Decision Tree C45");
    J48 wekaC45 = new J48();
    wekaC45.setUseLaplace(false);
    wekaC45.setCollapseTree(false);
    wekaC45.setUnpruned(true);
    wekaC45.setMinNumObj(2);
    wekaC45.setUseMDLcorrection(true);

    evalModel(wekaC45, mnistTrainWeka, mnistTestWeka);

    System.out.println("Random Forest 50 trees");
    int featuresToUse = (int) Math.sqrt(28 * 28); // Weka uses different defaults, so let's make sure both use the published value

    RandomForest wekaRF = new RandomForest();
    wekaRF.setNumExecutionSlots(1);
    wekaRF.setMaxDepth(0/*0 for unlimited*/);
    wekaRF.setNumFeatures(featuresToUse);
    wekaRF.setNumTrees(50);

    evalModel(wekaRF, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (brute)");
    IBk wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new LinearNNSearch());
    wekaNN.setCrossValidate(false);

    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (Ball Tree)");
    wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new BallTree());
    wekaNN.setCrossValidate(false);

    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (Cover Tree)");
    wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new CoverTree());
    wekaNN.setCrossValidate(false);

    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("Logistic Regression LBFGS lambda = 1e-4");
    Logistic logisticLBFGS = new Logistic();
    logisticLBFGS.setRidge(1e-4);
    logisticLBFGS.setMaxIts(500);

    evalModel(logisticLBFGS, mnistTrainWeka, mnistTestWeka);

    System.out.println("k-means (Loyd)");
    int origClassIndex = mnistTrainWeka.classIndex();
    mnistTrainWeka.setClassIndex(-1);
    mnistTrainWeka.deleteAttributeAt(origClassIndex);
    {
        long totalTime = 0;
        for (int i = 0; i < 10; i++) {
            SimpleKMeans wekaKMeans = new SimpleKMeans();
            wekaKMeans.setNumClusters(10);
            wekaKMeans.setNumExecutionSlots(1);
            wekaKMeans.setFastDistanceCalc(true);

            start = System.currentTimeMillis();
            wekaKMeans.buildClusterer(mnistTrainWeka);
            end = System.currentTimeMillis();
            totalTime += (end - start);
        }
        System.out.println("\tClustering took: " + (totalTime / 10.0) / 1000.0 + " on average");
    }
}
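
Note the pattern at the end of the listing: the class index is cleared with setClassIndex(-1) before the class attribute is removed, because deleteAttributeAt throws an IllegalArgumentException when asked to delete the current class attribute. A minimal sketch of the same pattern, working on a copy so the labeled data stays intact (variable names are illustrative):

// assuming 'labeled' is an Instances object with its class attribute set
Instances unlabeled = new Instances(labeled);   // copy; 'labeled' keeps its class attribute
int classIdx = unlabeled.classIndex();
unlabeled.setClassIndex(-1);                    // deleteAttributeAt refuses to delete the
unlabeled.deleteAttributeAt(classIdx);          // current class attribute, so unset it first
// 'unlabeled' can now be handed to a clusterer such as SimpleKMeans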

From source file: com.relationalcloud.partitioning.explanation.ExplanationHandler.java

License: Open Source License

/**
 * Repeats the selection from the database, removing duplicates since they only
 * increase the execution time, and runs the tuples through the classifier to
 * populate the justifiedpartition column.
 *
 * @param tableProcessed
 * @param classifier
 * @param wa
 * @throws SQLException
 * @throws Exception
 */
public void populateJustifiedColumn(String tableProcessed, Classifier classifier, ArrayList<String> attributes,
        Connection conn, int numbPart, Enumeration enumclassvalues) throws SQLException, Exception {
    if (true) {
        labelTest(tableProcessed, classifier, conn);
        return;
    }

    tableProcessed = removeQuotes(tableProcessed);

    // get from the DB the tuples content and their partitioning column
    String sqlstring = "SELECT distinct g.tupleid, ";
    for (String sc : attributes) {
        sqlstring += "s." + sc + ", ";
    }
    sqlstring += "g." + pcol + " FROM " + "(SELECT distinct tupleid," + pcol + " FROM `" + testingtable
            + "` WHERE tableid = '" + tableProcessed + "') AS g, relcloud_" + tableProcessed + " AS s "
            + "WHERE s.relcloud_id = g.tupleid;";

    System.out.println(sqlstring);
    Statement stmt = conn.createStatement();

    // initialize the testing table to avoid complaints from the classifier, using
    // a hash-partition-like distribution
    if (!testingtable.equals(sampledtrainingtable)) {
        int i = 0;

        Object o = enumclassvalues.nextElement();

        // set everything to an existing value to ensure that every field is
        // covered
        stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tableid = '"
                + tableProcessed + "'");
        // and then sprinkle in a bunch of other values (unsure whether this is
        // required)
        while (enumclassvalues.hasMoreElements()) {
            o = enumclassvalues.nextElement();

            // FIXME there might still be an issue in which tupleid%i does not exist,
            // and thus one of the "o" values never appears in the instances...
            stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tupleid%"
                    + numbPart + "=" + i + " AND tableid = '" + tableProcessed + "'");
            i++;
        }
    }

    ResultSet res = stmt.executeQuery(sqlstring);
    // create an instance from the resultset
    Instances data_tupleid = WekaHelper.retrieveInstanceFromResultSetComplete(res, dbPropertyFile);
    res.close();

    data_tupleid.setClassIndex(data_tupleid.numAttributes() - 1);
    Instances data_no_tupleid = makeLastNominal(data_tupleid);
    data_no_tupleid.setClassIndex(data_no_tupleid.numAttributes() - 1);
    // remove tupleid from data_no_tupleid, still available in data_tupleid
    data_no_tupleid.deleteAttributeAt(0);

    // if(data_no_tupleid.classAttribute().numValues()>1){
    System.out.println("Running the tuples through the classifier to populate " + explainedPartitionCol);

    // use data that still has the tupleid and newData for the classification
    Enumeration enum_data_tupleid = data_tupleid.enumerateInstances();
    Enumeration enum_data_no_tupleid = data_no_tupleid.enumerateInstances();

    PreparedStatement updateJustCol = conn.prepareStatement("UPDATE `" + testingtable + "` SET `"
            + explainedPartitionCol + "` = ? " + "WHERE tableid = '" + tableProcessed + "' AND tupleid = ?;");

    while (enum_data_tupleid.hasMoreElements() && enum_data_no_tupleid.hasMoreElements()) {

        Instance tupIDinstance = (Instance) enum_data_tupleid.nextElement();
        Instance instance = (Instance) enum_data_no_tupleid.nextElement();

        double part = classifier.classifyInstance(instance);
        if (Double.isNaN(part)) // a missing prediction is NaN, so an == comparison would never match
            System.err.println("No classification for:" + instance.toString());
        updateJustCol.setInt(1, (int) part);
        updateJustCol.setInt(2, (int) tupIDinstance.value(0));

        // System.out.println(tableProcessed+" "+ instance.value(0) + " " +
        // tupIDinstance.classValue() +" "+ part);

        updateJustCol.execute();
        updateJustCol.clearParameters();

    }

    updateJustCol.close();

}
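
The listing keeps two copies of the data in lockstep: data_tupleid still contains the tupleid attribute (used to address rows in the database), while data_no_tupleid has it removed via deleteAttributeAt(0) so the classifier never sees it. A compact sketch of that pattern, assuming a trained classifier and an Instances object 'withId' whose attribute 0 is a row identifier (names are illustrative):

Instances withoutId = new Instances(withId);     // copy, so 'withId' keeps the identifier
withoutId.setClassIndex(withoutId.numAttributes() - 1);
withoutId.deleteAttributeAt(0);                  // classifier input without the id attribute

for (int i = 0; i < withId.numInstances(); i++) {
    double predicted = classifier.classifyInstance(withoutId.instance(i));
    double rowId = withId.instance(i).value(0);  // id read from the parallel copy
    // ... write 'predicted' back to the row identified by 'rowId'
}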

From source file: com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License: Open Source License

/**
 * Builds the classifier.
 * 
 * @param data the training instances
 * @throws Exception if something goes wrong
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // save original header (needed for clusters to classes output)
    m_OriginalHeader = data.stringFreeStructure();

    // remove class attribute for clusterer
    Instances clusterData = new Instances(data);
    clusterData.setClassIndex(-1);
    clusterData.deleteAttributeAt(data.classIndex());
    m_ClusteringHeader = clusterData.stringFreeStructure();

    if (m_ClusteringHeader.numAttributes() == 0) {
        System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
        m_ZeroR = new ZeroR();
        m_ZeroR.buildClassifier(data);
    } else {
        m_ZeroR = null;

        // build clusterer
        m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
        m_ActualClusterer.buildClusterer(clusterData);

        if (!getLabelAllClusters()) {

            // determine classes-to-clusters mapping
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(m_ActualClusterer);
            eval.evaluateClusterer(clusterData);
            double[] clusterAssignments = eval.getClusterAssignments();
            int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()];
            int[] clusterTotals = new int[eval.getNumClusters()];
            double[] best = new double[eval.getNumClusters() + 1];
            double[] current = new double[eval.getNumClusters() + 1];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance instance = data.instance(i);
                if (!instance.classIsMissing()) {
                    counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
                    clusterTotals[(int) clusterAssignments[i]]++;
                }
            }
            best[eval.getNumClusters()] = Double.MAX_VALUE;
            ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
            m_ClustersToClasses = new double[best.length];
            System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
        } else {
            m_ClusterClassProbs = new double[m_ActualClusterer.numberOfClusters()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance clusterInstance = clusterData.instance(i);
                Instance originalInstance = data.instance(i);
                if (!originalInstance.classIsMissing()) {
                    double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance);
                    for (int j = 0; j < probs.length; j++) {
                        m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j];
                    }
                }
            }
            for (int i = 0; i < m_ClusterClassProbs.length; i++) {
                Utils.normalize(m_ClusterClassProbs[i]);
            }
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java

License: Apache License

/**
 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances,
 *      org.apache.commons.collections4.list.SetUniqueList)
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // iterate backwards so that deleting an attribute does not shift the indices
    // of attributes that still have to be checked; keep only the named attributes
    for (int i = testdata.numAttributes() - 1; i >= 0; i--) {
        if (!attributeNames.contains(testdata.attribute(i).name())) {
            testdata.deleteAttributeAt(i);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(i);
            }
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java

License: Apache License

/**
 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    for (int i = testdata.numAttributes() - 1; i >= 0; i--) {
        if (!attributeNames.contains(testdata.attribute(i).name())) {
            testdata.deleteAttributeAt(i);
            traindata.deleteAttributeAt(i);
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java

License: Apache License

/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {

    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();

    int violationCutoff = violationCountIterator.next();
    // now we filter the data;
    // this is first tried with the metrics with the fewest violations. If no buggy/bug-free
    // instances remain, it is repeated with the metrics with the second-fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    do {
        violationCutoff = violationCountIterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java

License: Apache License

/**
 * <p>
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java

License: Apache License

private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                if (secondAttempt) {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindataCopy);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                } else {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindata);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                }
            } catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                } else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        } while (!selectionSuccessful); // dummy loop for internal continue
    }

    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }
    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j<30 ensures that the computational time does not explode since the powerset is 2^n in
    // complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }
    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file: de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java

License: Apache License

/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST, "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
            .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
            .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");
    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
            eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}

From source file: de.ugoe.cs.cpdp.loader.NetgeneLoader.java

License: Apache License

@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values)
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // fix string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }

        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}
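
The two near-identical loops above (one for nominal, one for string attributes) both use the same delete-and-reinsert idiom to turn an attribute numeric. A sketch of how that idiom could be factored into a helper; toNumericAttribute is a hypothetical name, not part of the loader:

// hypothetical helper: replace the attribute at index j with a numeric attribute of the
// same name, keeping values that parse as numbers and falling back to 0.0 otherwise
private static void toNumericAttribute(Instances data, int j) {
    String name = data.attribute(j).name();
    double[] vals = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        Instance inst = data.instance(i);
        if (!inst.isMissing(j)) {
            try {
                vals[i] = Double.parseDouble(inst.stringValue(j));
            } catch (NumberFormatException e) {
                vals[i] = 0.0; // not a number
            }
        }
    }
    data.deleteAttributeAt(j);                      // drop the nominal/string attribute...
    data.insertAttributeAt(new Attribute(name), j); // ...and insert a numeric one in its place
    for (int i = 0; i < data.numInstances(); i++) {
        data.instance(i).setValue(j, vals[i]);
    }
}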