List of usage examples for weka.core.Instances.deleteAttributeAt
public void deleteAttributeAt(int position)
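Two properties of this method recur throughout the examples below: Weka refuses to delete the current class attribute (deleteAttributeAt throws an IllegalArgumentException for the class index), and deleting an attribute shifts every later attribute one position to the left. A minimal sketch using a made-up three-attribute dataset:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class DeleteAttributeAtDemo {
    public static void main(String[] args) {
        // tiny made-up dataset with two features and a class column
        ArrayList<Attribute> attrs = new ArrayList<>();
        attrs.add(new Attribute("x1"));
        attrs.add(new Attribute("x2"));
        attrs.add(new Attribute("label"));
        Instances data = new Instances("demo", attrs, 0);
        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0, 0.0 }));
        data.setClassIndex(data.numAttributes() - 1);

        data.deleteAttributeAt(0);                    // drops "x1"
        System.out.println(data.attribute(0).name()); // prints "x2": indices shifted left

        // data.deleteAttributeAt(data.classIndex()) would throw
        // IllegalArgumentException; clear the class index first
        data.setClassIndex(-1);
        data.deleteAttributeAt(data.numAttributes() - 1); // now "label" can go
    }
}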
From source file:com.edwardraff.WekaMNIST.java
License:Open Source License
public static void main(String[] args) throws IOException, Exception {
    String folder = args[0];
    String trainPath = folder + "MNISTtrain.arff";
    String testPath = folder + "MNISTtest.arff";

    System.out.println("Weka Timings");
    Instances mnistTrainWeka = new Instances(new BufferedReader(new FileReader(new File(trainPath))));
    mnistTrainWeka.setClassIndex(mnistTrainWeka.numAttributes() - 1);
    Instances mnistTestWeka = new Instances(new BufferedReader(new FileReader(new File(testPath))));
    mnistTestWeka.setClassIndex(mnistTestWeka.numAttributes() - 1);

    // normalize range into [0, 1]
    Normalize normalizeFilter = new Normalize();
    normalizeFilter.setInputFormat(mnistTrainWeka);
    mnistTestWeka = Normalize.useFilter(mnistTestWeka, normalizeFilter);
    mnistTrainWeka = Normalize.useFilter(mnistTrainWeka, normalizeFilter);

    long start, end;

    System.out.println("RBF SVM (Full Cache)");
    SMO smo = new SMO();
    smo.setKernel(new RBFKernel(mnistTrainWeka, 0 /* 0 causes Weka to cache the whole matrix... */, 0.015625));
    smo.setC(8.0);
    smo.setBuildLogisticModels(false);
    evalModel(smo, mnistTrainWeka, mnistTestWeka);

    System.out.println("RBF SVM (No Cache)");
    smo = new SMO();
    smo.setKernel(new RBFKernel(mnistTrainWeka, 1, 0.015625));
    smo.setC(8.0);
    smo.setBuildLogisticModels(false);
    evalModel(smo, mnistTrainWeka, mnistTestWeka);

    System.out.println("Decision Tree C45");
    J48 wekaC45 = new J48();
    wekaC45.setUseLaplace(false);
    wekaC45.setCollapseTree(false);
    wekaC45.setUnpruned(true);
    wekaC45.setMinNumObj(2);
    wekaC45.setUseMDLcorrection(true);
    evalModel(wekaC45, mnistTrainWeka, mnistTestWeka);

    System.out.println("Random Forest 50 trees");
    // Weka uses different defaults, so let's make sure they both use the published way
    int featuresToUse = (int) Math.sqrt(28 * 28);
    RandomForest wekaRF = new RandomForest();
    wekaRF.setNumExecutionSlots(1);
    wekaRF.setMaxDepth(0 /* 0 for unlimited */);
    wekaRF.setNumFeatures(featuresToUse);
    wekaRF.setNumTrees(50);
    evalModel(wekaRF, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (brute)");
    IBk wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new LinearNNSearch());
    wekaNN.setCrossValidate(false);
    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (Ball Tree)");
    wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new BallTree());
    wekaNN.setCrossValidate(false);
    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("1-NN (Cover Tree)");
    wekaNN = new IBk(1);
    wekaNN.setNearestNeighbourSearchAlgorithm(new CoverTree());
    wekaNN.setCrossValidate(false);
    evalModel(wekaNN, mnistTrainWeka, mnistTestWeka);

    System.out.println("Logistic Regression LBFGS lambda = 1e-4");
    Logistic logisticLBFGS = new Logistic();
    logisticLBFGS.setRidge(1e-4);
    logisticLBFGS.setMaxIts(500);
    evalModel(logisticLBFGS, mnistTrainWeka, mnistTestWeka);

    System.out.println("k-means (Lloyd)");
    int origClassIndex = mnistTrainWeka.classIndex();
    mnistTrainWeka.setClassIndex(-1);
    mnistTrainWeka.deleteAttributeAt(origClassIndex);
    {
        long totalTime = 0;
        for (int i = 0; i < 10; i++) {
            SimpleKMeans wekaKMeans = new SimpleKMeans();
            wekaKMeans.setNumClusters(10);
            wekaKMeans.setNumExecutionSlots(1);
            wekaKMeans.setFastDistanceCalc(true);
            start = System.currentTimeMillis();
            wekaKMeans.buildClusterer(mnistTrainWeka);
            end = System.currentTimeMillis();
            totalTime += (end - start);
        }
        System.out.println("\tClustering took: " + (totalTime / 10.0) / 1000.0 + " seconds on average");
    }
}
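The k-means block at the end shows the canonical preparation for clustering: remember the class index, clear it (Weka will not delete the class attribute), then delete the former class column. Isolated as a helper, under the assumption that mutating the dataset in place is acceptable, as it is in this benchmark:

import weka.core.Instances;

public final class StripClassInPlace {
    /** Removes the class column from data in place; no-op when no class is set. */
    public static void stripClass(Instances data) {
        int origClassIndex = data.classIndex();
        if (origClassIndex < 0) {
            return; // no class attribute to strip
        }
        data.setClassIndex(-1); // deleteAttributeAt refuses to remove the class attribute
        data.deleteAttributeAt(origClassIndex);
    }
}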
From source file:com.relationalcloud.partitioning.explanation.ExplanationHandler.java
License:Open Source License
/**
 * Repeat the selection from the database removing duplicates, since they will
 * only increase the execution time, and run the tuples through the classifier
 * to populate the justifiedpartition column.
 *
 * @param tableProcessed
 * @param classifier
 * @param wa
 * @throws SQLException
 * @throws Exception
 */
public void populateJustifiedColumn(String tableProcessed, Classifier classifier, ArrayList<String> attributes,
        Connection conn, int numbPart, Enumeration enumclassvalues) throws SQLException, Exception {

    if (true) {
        labelTest(tableProcessed, classifier, conn);
        return;
    }

    tableProcessed = removeQuotes(tableProcessed);

    // get from the DB the tuples' content and their partitioning column
    String sqlstring = "SELECT distinct g.tupleid, ";
    for (String sc : attributes) {
        sqlstring += "s." + sc + ", ";
    }
    sqlstring += "g." + pcol + " FROM " + "(SELECT distinct tupleid," + pcol + " FROM `" + testingtable
            + "` WHERE tableid = '" + tableProcessed + "') AS g, relcloud_" + tableProcessed + " AS s "
            + "WHERE s.relcloud_id = g.tupleid;";
    System.out.println(sqlstring);

    Statement stmt = conn.createStatement();

    // initialize the testing table to avoid complaints from the classifier,
    // using a hash-partition-like distribution
    if (!testingtable.equals(sampledtrainingtable)) {
        int i = 0;
        Object o = enumclassvalues.nextElement();
        // set everything to an existing value to ensure that every field is covered
        stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tableid = '"
                + tableProcessed + "'");
        // and then sprinkle in a bunch of other values (unsure whether it is required)
        while (enumclassvalues.hasMoreElements()) {
            o = enumclassvalues.nextElement();
            // FIXME there might still be an issue in which tupleid%i does not exist,
            // and thus one of the "o" never appears in the instance...
            stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tupleid%" + numbPart
                    + "=" + i + " AND tableid = '" + tableProcessed + "'");
            i++;
        }
    }

    ResultSet res = stmt.executeQuery(sqlstring);

    // create an instance from the result set
    Instances data_tupleid = WekaHelper.retrieveInstanceFromResultSetComplete(res, dbPropertyFile);
    res.close();
    data_tupleid.setClassIndex(data_tupleid.numAttributes() - 1);
    Instances data_no_tupleid = makeLastNominal(data_tupleid);
    data_no_tupleid.setClassIndex(data_no_tupleid.numAttributes() - 1);

    // remove tupleid from data_no_tupleid; it is still available in data_tupleid
    data_no_tupleid.deleteAttributeAt(0);

    // if(data_no_tupleid.classAttribute().numValues()>1){
    System.out.println("Running the tuples through the classifier to populate " + explainedPartitionCol);

    // use data that still has the tupleid and newData for the classification
    Enumeration enum_data_tupleid = data_tupleid.enumerateInstances();
    Enumeration enum_data_no_tupleid = data_no_tupleid.enumerateInstances();

    PreparedStatement updateJustCol = conn.prepareStatement("UPDATE `" + testingtable + "` SET `"
            + explainedPartitionCol + "` = ? " + "WHERE tableid = '" + tableProcessed + "' AND tupleid = ?;");

    while (enum_data_tupleid.hasMoreElements() && enum_data_no_tupleid.hasMoreElements()) {
        Instance tupIDinstance = (Instance) enum_data_tupleid.nextElement();
        Instance instance = (Instance) enum_data_no_tupleid.nextElement();
        double part = classifier.classifyInstance(instance);
        if (part == Instance.missingValue())
            System.err.println("No classification for:" + instance.toString());
        updateJustCol.setInt(1, (int) part);
        updateJustCol.setInt(2, (int) tupIDinstance.value(0));
        // System.out.println(tableProcessed + " " + instance.value(0) + " "
        //     + tupIDinstance.classValue() + " " + part);
        updateJustCol.execute();
        updateJustCol.clearParameters();
    }
    updateJustCol.close();
}
From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License:Open Source License
/**
 * builds the classifier
 *
 * @param data the training instances
 * @throws Exception if something goes wrong
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // save original header (needed for clusters to classes output)
    m_OriginalHeader = data.stringFreeStructure();

    // remove class attribute for clusterer
    Instances clusterData = new Instances(data);
    clusterData.setClassIndex(-1);
    clusterData.deleteAttributeAt(data.classIndex());
    m_ClusteringHeader = clusterData.stringFreeStructure();

    if (m_ClusteringHeader.numAttributes() == 0) {
        System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
        m_ZeroR = new ZeroR();
        m_ZeroR.buildClassifier(data);
    } else {
        m_ZeroR = null;

        // build clusterer
        m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
        m_ActualClusterer.buildClusterer(clusterData);

        if (!getLabelAllClusters()) {
            // determine classes-to-clusters mapping
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(m_ActualClusterer);
            eval.evaluateClusterer(clusterData);
            double[] clusterAssignments = eval.getClusterAssignments();
            int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()];
            int[] clusterTotals = new int[eval.getNumClusters()];
            double[] best = new double[eval.getNumClusters() + 1];
            double[] current = new double[eval.getNumClusters() + 1];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance instance = data.instance(i);
                if (!instance.classIsMissing()) {
                    counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
                    clusterTotals[(int) clusterAssignments[i]]++;
                }
            }
            best[eval.getNumClusters()] = Double.MAX_VALUE;
            ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
            m_ClustersToClasses = new double[best.length];
            System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
        } else {
            m_ClusterClassProbs = new double[m_ActualClusterer.numberOfClusters()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance clusterInstance = clusterData.instance(i);
                Instance originalInstance = data.instance(i);
                if (!originalInstance.classIsMissing()) {
                    double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance);
                    for (int j = 0; j < probs.length; j++) {
                        m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j];
                    }
                }
            }
            for (int i = 0; i < m_ClusterClassProbs.length; i++) {
                Utils.normalize(m_ClusterClassProbs[i]);
            }
        }
    }
}
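Unlike the in-place variant in the MNIST example, this classifier first copies the data (new Instances(data)), so the caller's dataset keeps its class attribute while the clusterer sees a class-free view. The copy-then-delete core, reduced to a sketch:

import weka.core.Instances;

public final class ClusterViewDemo {
    /** Returns a class-free copy of data for a clusterer; data itself is untouched. */
    public static Instances clusterView(Instances data) {
        Instances clusterData = new Instances(data); // copies header and instances
        clusterData.setClassIndex(-1);
        clusterData.deleteAttributeAt(data.classIndex());
        return clusterData;
    }
}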
From source file:de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java
License:Apache License
/**
 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances,
 *      org.apache.commons.collections4.list.SetUniqueList)
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // iterate backwards: deleting an attribute shifts all later indices left,
    // so a forward loop would skip attributes
    for (int i = testdata.numAttributes() - 1; i >= 0; i--) {
        if (!attributeNames.contains(testdata.attribute(i).name())) {
            testdata.deleteAttributeAt(i);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(i);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java
License:Apache License
/**
 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    for (int i = testdata.numAttributes() - 1; i >= 0; i--) {
        if (!attributeNames.contains(testdata.attribute(i).name())) {
            testdata.deleteAttributeAt(i);
            traindata.deleteAttributeAt(i);
        }
    }
}
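Both variants iterate from the last attribute down to 0, which is the safe direction: deleting attribute i shifts every later attribute one slot left, so a forward loop would skip the attribute that slides into the freed slot. The pattern as a standalone sketch, with an added guard for the class attribute (which deleteAttributeAt refuses to remove; the guard is an assumption, not part of the original):

import java.util.List;

import weka.core.Instances;

public final class KeepOnlyNamed {
    /** Deletes every non-class attribute whose name is not in keep.
     *  Iterates backwards so deletions do not invalidate the loop index. */
    public static void keepOnly(Instances data, List<String> keep) {
        for (int i = data.numAttributes() - 1; i >= 0; i--) {
            if (i != data.classIndex() && !keep.contains(data.attribute(i).name())) {
                data.deleteAttributeAt(i);
            }
        }
    }
}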
From source file:de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License:Apache License
/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {

    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }

    // now determine cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();
    int violationCutoff = violationCountIterator.next();

    // now we filter the data;
    // this is first tried with the metrics with the fewest violations. If no buggy/bugfree
    // instances remain, this is repeated with the metrics with the second fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    do {
        violationCutoff = violationCountIterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }

    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}
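Several processors in this section delete the same attribute index from the training and the test data in the same pass; CLAMI's javadoc spells out why: the two sets must keep a consistent metric set, or a model trained on one cannot be applied to the other. A reduced sketch of the paired deletion (the keep predicate is a stand-in for the processor's actual selection logic):

import java.util.function.IntPredicate;

import weka.core.Instances;

public final class PairedAttributeDrop {
    /** Drops the same non-class attributes from both datasets so their
     *  headers stay compatible. keep decides which indices survive. */
    public static void dropTogether(Instances traindata, Instances testdata, IntPredicate keep) {
        for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
            if (j != traindata.classIndex() && !keep.test(j)) {
                traindata.deleteAttributeAt(j);
                testdata.deleteAttributeAt(j);
            }
        }
    }
}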
From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java
License:Apache License
/**
 * <p>
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License:Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                if (secondAttempt) {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindataCopy);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                } else {
                    AttributeSelection attsel = new AttributeSelection();
                    CfsSubsetEval eval = new CfsSubsetEval();
                    GreedyStepwise search = new GreedyStepwise();
                    search.setSearchBackwards(true);
                    attsel.setEvaluator(eval);
                    attsel.setSearch(search);
                    attsel.SelectAttributes(traindata);
                    Set<Integer> cfsSet = new HashSet<>();
                    for (int attr : attsel.selectedAttributes()) {
                        cfsSet.add(attr);
                    }
                    cfsSets.add(cfsSet);
                    selectionSuccessful = true;
                }
            } catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                } else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        } while (!selectionSuccessful); // dummy loop for internal continue
    }

    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }

    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j < 30 ensures that the computational time does not explode, since the power set is
    // 2^n in complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }

    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST, "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
            .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
            .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");
    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
            eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}
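TCA replaces the whole feature space: every non-class attribute is deleted, then the new kernel dimensions are inserted and filled. The delete-then-insert skeleton as a sketch, with the value filling omitted; it assumes a class attribute is set, as in the example (note that repeated inserts at index 1 leave the columns in reverse name order, exactly as in the code above):

import weka.core.Attribute;
import weka.core.Instances;

public final class FeatureSpaceSwap {
    /** Drops all non-class attributes and inserts n fresh numeric ones. */
    public static void replaceFeatures(Instances data, int n) {
        for (int j = data.numAttributes() - 1; j >= 0; j--) {
            if (j != data.classIndex()) {
                data.deleteAttributeAt(j); // class index is adjusted automatically
            }
        }
        for (int j = 0; j < n; j++) {
            // each insert at index 1 pushes the previous inserts one slot right
            data.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        }
    }
}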
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;
    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values)
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }

        // fix string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }

        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }
        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}
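A last pattern worth isolating from this loader: deleteAttributeAt paired with insertAttributeAt at the same index effectively retypes a column. The nominal or string attribute is removed, a numeric attribute with the same name takes its place, and the parsed values are written back. As a sketch (tmpVals is assumed to already hold the values parsed to double, as in the loader above):

import weka.core.Attribute;
import weka.core.Instances;

public final class AttributeRetyper {
    /** Replaces the attribute at index j with a numeric one of the same name. */
    public static void retypeToNumeric(Instances data, int j, double[] tmpVals) {
        String name = data.attribute(j).name();
        data.deleteAttributeAt(j);
        data.insertAttributeAt(new Attribute(name), j); // new Attribute(name) is numeric
        for (int i = 0; i < data.size(); i++) {
            data.instance(i).setValue(j, tmpVals[i]); // write back the buffered values
        }
    }
}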