List of usage examples for weka.core Instances numAttributes
public int numAttributes()
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>//from www. j ava 2 s .co m * Z-Score normalization using the mean and std of the training data (N3 in Transfer Defect * Learning by Nam et al.). * </p> * * @param testdata * test data of the target product * @param traindata * training data */ public static void zScoreTraining(Instances testdata, Instances traindata) { final double[] mean = new double[testdata.numAttributes()]; final double[] std = new double[testdata.numAttributes()]; // get means of training for (int j = 0; j < traindata.numAttributes(); j++) { if (traindata.classIndex() != j) { mean[j] = traindata.meanOrMode(j); std[j] = Math.sqrt(traindata.variance(j)); } } applyZScore(testdata, mean, std); applyZScore(traindata, mean, std); }
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>/* ww w. jav a 2s . c o m*/ * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning * by Nam et al.). * </p> * * @param testdata * test data of the target product * @param traindata * training data */ public static void zScoreTarget(Instances testdata, Instances traindata) { final double[] mean = new double[testdata.numAttributes()]; final double[] std = new double[testdata.numAttributes()]; // get means of testdata for (int j = 0; j < testdata.numAttributes(); j++) { if (testdata.classIndex() != j) { mean[j] = testdata.meanOrMode(j); std[j] = Math.sqrt(testdata.variance(j)); } } applyZScore(testdata, mean, std); applyZScore(traindata, mean, std); }
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>/*from w ww . java2 s . c o m*/ * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning * by Nam et al.). * </p> * * @param testdata * test data of the target product * @param traindataSet * training data */ public static void zScoreTarget(Instances testdata, SetUniqueList<Instances> traindataSet) { final double[] mean = new double[testdata.numAttributes()]; final double[] std = new double[testdata.numAttributes()]; // get means of testdata for (int j = 0; j < testdata.numAttributes(); j++) { if (testdata.classIndex() != j) { mean[j] = testdata.meanOrMode(j); std[j] = Math.sqrt(testdata.variance(j)); } } applyZScore(testdata, mean, std); for (Instances traindata : traindataSet) { applyZScore(traindata, mean, std); } }
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>/*from ww w . j a va 2 s . co m*/ * Internal helper function * </p> */ private static void applyZScore(Instances data, double[] mean, double[] std) { for (int i = 0; i < data.numInstances(); i++) { Instance instance = data.instance(i); for (int j = 0; j < data.numAttributes(); j++) { if (data.classIndex() != j) { instance.setValue(j, instance.value(j) - mean[j] / std[j]); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java
License:Apache License
/**
 * <p>
 * Applies the synonym pruning based on the training data. A non-class attribute is kept if at
 * least one ordered pair of distinct training instances exists for which the absolute
 * difference in this attribute is not larger than the absolute difference in any attribute
 * (the inner scan covers all attributes, the class attribute included). Otherwise the
 * attribute is deleted from both the test and the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    // iterate from the last attribute down, so deletions never touch indices still to visit
    for (int candidate = traindata.numAttributes() - 1; candidate >= 0; candidate--) {
        if (candidate == traindata.classIndex()) {
            continue;
        }

        boolean closestSomewhere = false;
        for (int first = 0; first < traindata.size() && !closestSomewhere; first++) {
            for (int second = 0; second < traindata.size() && !closestSomewhere; second++) {
                if (first == second) {
                    continue;
                }
                double smallestGap = Double.MAX_VALUE;
                double candidateGap = Double.MAX_VALUE;
                for (int k = 0; k < traindata.numAttributes(); k++) {
                    final double gap =
                        Math.abs(traindata.get(first).value(k) - traindata.get(second).value(k));
                    if (gap < smallestGap) {
                        smallestGap = gap;
                    }
                    if (k == candidate) {
                        candidateGap = gap;
                    }
                }
                closestSomewhere = candidateGap <= smallestGap;
            }
        }

        if (closestSomewhere) {
            continue;
        }
        testdata.deleteAttributeAt(candidate);
        traindata.deleteAttributeAt(candidate);
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License:Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet) throws Exception { Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1]; IntStream.range(0, counts.length).forEach(val -> counts[val] = 0); for (Instances traindata : traindataSet) { J48 decisionTree = new J48(); decisionTree.buildClassifier(traindata); int k = 0; for (int j = 0; j < traindata.numAttributes(); j++) { if (j != traindata.classIndex()) { if (decisionTree.toString().contains(traindata.attribute(j).name())) { counts[k] = counts[k] + 1; }/* ww w . j a va 2 s . c o m*/ k++; } } } int[] topkIndex = new int[counts.length]; IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val); SortUtils.quicksort(counts, topkIndex, true); // get CFSs for each training set List<Set<Integer>> cfsSets = new LinkedList<>(); for (Instances traindata : traindataSet) { boolean selectionSuccessful = false; boolean secondAttempt = false; Instances traindataCopy = null; do { try { if (secondAttempt) { AttributeSelection attsel = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); attsel.setEvaluator(eval); attsel.setSearch(search); attsel.SelectAttributes(traindataCopy); Set<Integer> cfsSet = new HashSet<>(); for (int attr : attsel.selectedAttributes()) { cfsSet.add(attr); } cfsSets.add(cfsSet); selectionSuccessful = true; } else { AttributeSelection attsel = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); attsel.setEvaluator(eval); attsel.setSearch(search); attsel.SelectAttributes(traindata); Set<Integer> cfsSet = new HashSet<>(); for (int attr : attsel.selectedAttributes()) { cfsSet.add(attr); } cfsSets.add(cfsSet); selectionSuccessful = true; } } catch (IllegalArgumentException e) { String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*"; 
Pattern p = Pattern.compile(regex); Matcher m = p.matcher(e.getMessage()); if (!m.find()) { // cannot treat problem, rethrow exception throw e; } String attributeName = m.group(1); int attrIndex = traindata.attribute(attributeName).index(); if (secondAttempt) { traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex); } else { traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex); } Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training"); secondAttempt = true; continue; } } while (!selectionSuccessful); // dummy loop for internal continue } double[] coverages = new double[topkIndex.length]; for (Set<Integer> cfsSet : cfsSets) { Set<Integer> topkSet = new HashSet<>(); for (int k = 0; k < topkIndex.length; k++) { topkSet.add(topkIndex[k]); coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size()); } } double bestCoverageValue = Double.MIN_VALUE; int bestCoverageIndex = 0; for (int i = 0; i < coverages.length; i++) { if (coverages[i] > bestCoverageValue) { bestCoverageValue = coverages[i]; bestCoverageIndex = i; } } // build correlation matrix SpearmansCorrelation corr = new SpearmansCorrelation(); double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex]; for (Instances traindata : traindataSet) { double[][] vectors = new double[bestCoverageIndex][traindata.size()]; for (int i = 0; i < traindata.size(); i++) { for (int j = 0; j < bestCoverageIndex; j++) { vectors[j][i] = traindata.get(i).value(topkIndex[j]); } } for (int j = 0; j < bestCoverageIndex; j++) { for (int k = j + 1; k < bestCoverageIndex; k++) { correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k])); } } } Set<Integer> topkSetIndexSet = new TreeSet<>(); // j<30 ensures that the computational time does not explode since the powerset is 2^n in // complexity for (int j = 0; j < bestCoverageIndex && j < 30; j++) { topkSetIndexSet.add(j); } Set<Set<Integer>> allCombinations = 
Sets.powerSet(topkSetIndexSet); double bestOptCoverage = Double.MIN_VALUE; Set<Integer> opttopkSetIndexSet = null; for (Set<Integer> combination : allCombinations) { if (isUncorrelated(correlationMatrix, combination)) { double currentCoverage = 0.0; Set<Integer> topkCombination = new TreeSet<>(); for (Integer index : combination) { topkCombination.add(topkIndex[index]); } for (Set<Integer> cfsSet : cfsSets) { currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size()); } if (currentCoverage > bestOptCoverage) { bestOptCoverage = currentCoverage; opttopkSetIndexSet = combination; } } } Set<Integer> opttopkIndex = new TreeSet<>(); for (Integer index : opttopkSetIndexSet) { opttopkIndex.add(topkIndex[index]); } Console.traceln(Level.FINE, "selected the following metrics:"); for (Integer index : opttopkIndex) { Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name()); } // finally remove attributes for (int j = testdata.numAttributes() - 1; j >= 0; j--) { if (j != testdata.classIndex() && !opttopkIndex.contains(j)) { testdata.deleteAttributeAt(j); for (Instances traindata : traindataSet) { traindata.deleteAttributeAt(j); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/** * <p>//from ww w.j a va2s .com * Applies TCA to the test and training data. * </p> * * @param testdata * the test data * @param traindata * the training data */ private void applyTCA(Instances testdata, Instances traindata) { final int sizeTest = testdata.numInstances(); final int sizeTrain = traindata.numInstances(); final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata); final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in // the // paper final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the // paper final double mu = 1.0; // default from the MATLAB implementation final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu); PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain); Console.traceln(Level.FINEST, "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")"); final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix) .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix) .multiplyRight(centerMatrix).multiplyRight(kernelMatrix); Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem"); General eigenvalueDecomposition = new JamaEigenvalue.General(); eigenvalueDecomposition.compute(optimizationProblem); Console.traceln(Level.FINEST, "eigenvalue problem solved"); Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues(); System.out.println(eigenvaluesArray.length); final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length]; final int[] index = new int[(int) eigenvaluesArray.length]; // create kernel transformation matrix from eigenvectors for (int i = 0; i < eigenvaluesArray.length; i++) { eigenvalues[i] = eigenvaluesArray.doubleValue(i); index[i] = i; } SortUtils.quicksort(eigenvalues, index); final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight( 
eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension))); // update testdata and traindata for (int j = testdata.numAttributes() - 1; j >= 0; j--) { if (j != testdata.classIndex()) { testdata.deleteAttributeAt(j); traindata.deleteAttributeAt(j); } } for (int j = 0; j < reducedDimension; j++) { testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1); traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1); } for (int i = 0; i < sizeTrain; i++) { for (int j = 0; j < reducedDimension; j++) { traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j)); } } for (int i = 0; i < sizeTest; i++) { for (int j = 0; j < reducedDimension; j++) { testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j)); } } }
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data. For every instance a power value is computed
 * as the product, over all non-class attributes, of
 * {@code p(range|class)^2 / (p(range|class) * p(class) + p(range|notClass) * p(notClass))}.
 * Instances whose power reaches the cut-off taken from the sorted powers are returned.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data; an empty copy of {@code data} if {@code data} has no instances
 */
protected Instances applyCLIFF(Instances data) {
    final int numInstances = data.numInstances();
    final int classIndex = data.classIndex();

    // empty copy with the same header as the input
    final Instances selected = new Instances(data);
    selected.delete();
    if (numInstances == 0) {
        return selected;
    }

    final int[] counts = data.attributeStats(classIndex).nominalCounts;
    // share of instances with class value 1; it is used as a probability below (1.0 - x)
    final double probDefect = counts[1] / (double) numInstances;

    final double[] powerEntity = new double[numInstances];
    Arrays.fill(powerEntity, 1.0);

    for (int j = 0; j < data.numAttributes(); j++) {
        if (j == classIndex) {
            // the class attribute has no power value and must not enter the product
            continue;
        }
        final double[] ranges = getRanges(data, j);
        final double[] probDefectRange = getRangeProbabilities(data, j, ranges);
        for (int i = 0; i < numInstances; i++) {
            final double value = data.instance(i).value(j);
            final int range = determineRange(ranges, value);
            final double probClass;
            final double probNotClass;
            final double probRangeClass;
            final double probRangeNotClass;
            if (data.instance(i).classValue() == 1) {
                probClass = probDefect;
                probNotClass = 1.0 - probDefect;
                probRangeClass = probDefectRange[range];
                probRangeNotClass = 1.0 - probDefectRange[range];
            }
            else {
                probClass = 1.0 - probDefect;
                probNotClass = probDefect;
                probRangeClass = 1.0 - probDefectRange[range];
                probRangeNotClass = probDefectRange[range];
            }
            powerEntity[i] *= Math.pow(probRangeClass, 2.0) /
                (probRangeClass * probClass + probRangeNotClass * probNotClass);
        }
    }

    final double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    // clamp the position so that percentage values of 0 (or outside [0, 1]) stay in bounds
    int cutOffIndex = (int) (numInstances * (1 - percentage));
    cutOffIndex = Math.max(0, Math.min(cutOffIndex, numInstances - 1));
    final double cutOff = sortedPower[cutOffIndex];

    for (int i = 0; i < numInstances; i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License:Apache License
/** * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, * weka.core.Instances)/*from w w w.j ava 2 s . c o m*/ */ @Override public Instances apply(Instances testdata, Instances traindata) { Instances filteredTraindata = new Instances(traindata); filteredTraindata.clear(); double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1]; int classIndex = testdata.classIndex(); for (int i = 0; i < testdata.size(); i++) { int k = 0; for (int j = 0; j < testdata.numAttributes(); j++) { if (j != classIndex) { data[i][k] = testdata.get(i).value(j); k++; } } } for (int i = 0; i < traindata.size(); i++) { int k = 0; for (int j = 0; j < traindata.numAttributes(); j++) { if (j != classIndex) { data[i + testdata.size()][k] = traindata.get(i).value(j); k++; } } } DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data); Database db = new StaticArrayDatabase(dbc, null); db.initialize(); DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10); Clustering<Model> clusterer = dbscan.run(db); Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD); int firstInternalIndex = rel.iterDBIDs().internalGetIndex(); for (Cluster<Model> cluster : clusterer.getAllClusters()) { // check if cluster has any training data DBIDIter iter = rel.iterDBIDs(); boolean noMatch = true; for (int i = 0; noMatch && i < testdata.size(); i++) { noMatch = !cluster.getIDs().contains(iter); iter.advance(); } if (!noMatch) { // cluster contains test data for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) { int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex; if (internalIndex >= 0) { // index belongs to a training instance filteredTraindata.add(traindata.get(internalIndex)); } } } } return filteredTraindata; }
From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License:Apache License
/**
 * Reduces the training data set to the single product with the highest predicted score. A
 * REPTree is trained on similarity vectors between all pairs of training products, labelled
 * with the F-measure that a J48 model built on one product reaches on the other, and is then
 * used to score the similarity vector between the test data and each training product.
 *
 * @param testdata
 *            the test data
 * @param traindataSet
 *            the training data sets; afterwards contains only the best scoring product
 * @throws RuntimeException
 *             wrapping any exception raised while building or applying the models
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // row 0 is used for the test data, row i + 1 for training product i (see usages below)
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    final double[] similarity = compareCharacteristics(data, i + 1, j + 1);
                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }

        REPTree repTree = new REPTree();
        // the number of folds is fixed to 2
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            // the score slot stays at its default 0.0; it is what the tree predicts
            testTrainSimilarity
                .add(new DenseInstance(1.0, compareCharacteristics(data, 0, i + 1)));
        }

        int bestScoringProductIndex = -1;
        // start below every possible score so that a product is chosen even if all predicted
        // scores are 0.0 (Double.MIN_VALUE would be the smallest POSITIVE double)
        double maxScore = Double.NEGATIVE_INFINITY;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    }
    catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}

/**
 * Builds the similarity vector of two rows: per attribute 2.0 ("less") if the other value is
 * below 90% of the base value, 1.0 ("more") if it is above 110% of the base value, and 0.0
 * ("same") otherwise. The vector has one extra trailing slot for the score, left at 0.0.
 *
 * @param data
 *            instances holding the rows to compare
 * @param baseRow
 *            index of the reference row
 * @param otherRow
 *            index of the row compared against the reference
 * @return similarity vector of length {@code data.numAttributes() + 1}
 */
private static double[] compareCharacteristics(Instances data, int baseRow, int otherRow) {
    final int numAttributes = data.numAttributes();
    final double[] similarity = new double[numAttributes + 1];
    for (int k = 0; k < numAttributes; k++) {
        final double base = data.get(baseRow).value(k);
        final double other = data.get(otherRow).value(k);
        if (0.9 * base > other) {
            similarity[k] = 2.0;
        }
        else if (1.1 * base < other) {
            similarity[k] = 1.0;
        }
        else {
            similarity[k] = 0.0;
        }
    }
    return similarity;
}