List of usage examples for weka.core.Instances.size()
@Override public int size()
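size() returns the number of instances in the dataset, the same value as numInstances(); in Weka 3.7+ Instances is list-like, so size() pairs with get(int). Before the examples from real projects below, here is a minimal, self-contained sketch of the call; the file name "data.arff" is a placeholder assumption, not taken from the examples:

import java.io.BufferedReader;
import java.io.FileReader;

import weka.core.Instance;
import weka.core.Instances;

public class InstancesSizeExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset from an ARFF file ("data.arff" is a placeholder).
        try (BufferedReader reader = new BufferedReader(new FileReader("data.arff"))) {
            Instances instances = new Instances(reader);
            // size() reports the number of instances, same as numInstances().
            System.out.println("Loaded " + instances.size() + " instances");
            // Because Instances is list-like, size() pairs with get(int)
            // for index-based iteration over the dataset.
            for (int i = 0; i < instances.size(); i++) {
                Instance instance = instances.get(i);
                System.out.println(instance);
            }
        }
    }
}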
From source file:controller.NaiveBayesServlet.java
@Override
protected void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    request.setCharacterEncoding("UTF-8");
    String dir = "/data/";
    String path = getServletContext().getRealPath(dir);
    String action = request.getParameter("action");
    switch (action) {
        case "create": {
            String fileName = request.getParameter("file");
            String aux = fileName.substring(0, fileName.indexOf("."));
            String pathInput = path + "/" + request.getParameter("file");
            String pathTrainingOutput = path + "/" + aux + "-training-arff.txt";
            String pathTestOutput = path + "/" + aux + "-test-arff.txt";
            String pathNaivebayes = path + "/" + aux + "-naiveBayes.txt";
            String name = request.getParameter("name");
            int range = Integer.parseInt(request.getParameter("range"));
            int size = Integer.parseInt(request.getParameter("counter"));
            String[] columns = new String[size];
            String[] types = new String[size];
            int[] positions = new int[size];
            int counter = 0;
            for (int i = 0; i < size; i++) {
                if (request.getParameter("column-" + (i + 1)) != null) {
                    columns[counter] = request.getParameter("column-" + (i + 1));
                    types[counter] = request.getParameter("type-" + (i + 1));
                    positions[counter] = Integer.parseInt(request.getParameter("position-" + (i + 1)));
                    counter++;
                }
            }
            FormatFiles.convertTxtToArff(pathInput, pathTrainingOutput, pathTestOutput, name, columns,
                    types, positions, counter, range);
            try {
                NaiveBayes naiveBayes = new NaiveBayes();
                BufferedReader readerTraining = new BufferedReader(new FileReader(pathTrainingOutput));
                Instances instancesTraining = new Instances(readerTraining);
                instancesTraining.setClassIndex(instancesTraining.numAttributes() - 1);
                naiveBayes.buildClassifier(instancesTraining);
                BufferedReader readerTest = new BufferedReader(new FileReader(pathTestOutput));
                // BufferedReader readerTest = new BufferedReader(new FileReader(pathTrainingOutput));
                Instances instancesTest = new Instances(readerTest);
                instancesTest.setClassIndex(instancesTest.numAttributes() - 1);
                Evaluation eval = new Evaluation(instancesTraining);
                eval.evaluateModel(naiveBayes, instancesTest);
                int corrects = 0;
                int truePositive = 0;
                int trueNegative = 0;
                int falsePositive = 0;
                int falseNegative = 0;
                for (int i = 0; i < instancesTest.size(); i++) {
                    Instance instance = instancesTest.get(i);
                    double correctValue = instance
                            .value(instance.attribute(instancesTest.numAttributes() - 1));
                    double classification = naiveBayes.classifyInstance(instance);
                    if (correctValue == classification) {
                        corrects++;
                    }
                    if (correctValue == 1 && classification == 1) {
                        truePositive++;
                    }
                    if (correctValue == 1 && classification == 0) {
                        falseNegative++;
                    }
                    if (correctValue == 0 && classification == 1) {
                        falsePositive++;
                    }
                    if (correctValue == 0 && classification == 0) {
                        trueNegative++;
                    }
                }
                PrintWriter writer = new PrintWriter(
                        new BufferedWriter(new FileWriter(pathNaivebayes, false)));
                writer.println(naiveBayes.toString());
                writer.println("");
                writer.println("");
                writer.println("Results");
                writer.println(eval.toSummaryString());
                writer.close();
                response.sendRedirect("NaiveBayes?action=view&corrects=" + corrects + "&totalTest="
                        + instancesTest.size() + "&totalTrainig=" + instancesTraining.size() + "&range="
                        + range + "&truePositive=" + truePositive + "&trueNegative=" + trueNegative
                        + "&falsePositive=" + falsePositive + "&falseNegative=" + falseNegative
                        + "&fileName=" + aux + "-naiveBayes.txt");
            } catch (Exception e) {
                System.out.println(e.getMessage());
                response.sendRedirect("Navigation?action=naiveBayes");
            }
            break;
        }
        default:
            response.sendError(404);
    }
}
From source file:de.fub.maps.project.detector.model.inference.processhandler.CrossValidationProcessHandler.java
License:Open Source License
private void evaluate(Instances trainingSet) {
    try {
        Evaluation evaluation = new Evaluation(trainingSet);
        int crossValidationFoldsCount = getCrossValidationFoldsCount();
        // cap the fold count at the dataset size
        crossValidationFoldsCount = crossValidationFoldsCount > trainingSet.size() ? trainingSet.size()
                : crossValidationFoldsCount;
        evaluation.crossValidateModel(getInferenceModel().getClassifier(), trainingSet,
                crossValidationFoldsCount, new Random(1));
        updateVisualRepresentation(evaluation);
    } catch (Exception ex) {
        throw new InferenceModelClassifyException(ex.getMessage(), ex);
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
    if (counts[1] < counts[0]) {
        Instances negatives = new Instances(traindata);
        Instances positives = new Instances(traindata);
        for (int i = traindata.size() - 1; i >= 0; i--) {
            if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
                negatives.remove(i);
            }
            if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
                positives.remove(i);
            }
        }
        Resample resample = new Resample();
        resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
        try {
            resample.setInputFormat(traindata);
            positives = Filter.useFilter(positives, resample);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        traindata.clear();
        for (int i = 0; i < negatives.size(); i++) {
            traindata.add(negatives.get(i));
        }
        for (int i = 0; i < positives.size(); i++) {
            traindata.add(positives.get(i));
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Resampling.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Resample resample = new Resample();
    resample.setSampleSizePercent(100);
    resample.setBiasToUniformClass(1.0);
    Instances traindataSample;
    try {
        resample.setInputFormat(traindata);
        traindataSample = Filter.useFilter(traindata, resample);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    traindata.clear();
    for (int i = 0; i < traindataSample.size(); i++) {
        traindata.add(traindataSample.get(i));
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java
License:Apache License
/**
 * <p>
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License:Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                AttributeSelection attsel = new AttributeSelection();
                CfsSubsetEval eval = new CfsSubsetEval();
                GreedyStepwise search = new GreedyStepwise();
                search.setSearchBackwards(true);
                attsel.setEvaluator(eval);
                attsel.setSearch(search);
                // on the second attempt, select on the upscaled copy instead
                attsel.SelectAttributes(secondAttempt ? traindataCopy : traindata);
                Set<Integer> cfsSet = new HashSet<>();
                for (int attr : attsel.selectedAttributes()) {
                    cfsSet.add(attr);
                }
                cfsSets.add(cfsSet);
                selectionSuccessful = true;
            } catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                } else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE,
                        "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
            }
        } while (!selectionSuccessful);
    }

    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }

    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j < 30 ensures that the computational time does not explode, since the powerset is 2^n in complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }
    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];
    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);
            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }
    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];
    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Gets an array with the ranges from the data for a given attribute
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @return the ranges for the attribute
 */
private double[] getRanges(Instances data, int j) {
    double[] values = new double[numRanges + 1];
    for (int k = 0; k < numRanges; k++) {
        values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges));
    }
    values[numRanges] = data.attributeStats(j).numericStats.max;
    return values;
}
From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License:Apache License
/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();
    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();
    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }
    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();
    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if cluster has any training data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter
                    .advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }
        }
    }
    return filteredTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License:Apache License
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);
    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);
    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0;
                        } else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0;
                        } else {
                            similarity[k] = 0.0;
                        }
                    }
                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }
        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);
        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                } else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                } else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }
        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    } catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}