List of usage examples for weka.core Instances numInstances
public int numInstances()
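numInstances() returns the number of instance rows currently held by an Instances object and is the usual loop bound when iterating over a dataset. A minimal sketch (the dataset name and attribute are made up for illustration):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class NumInstancesDemo {
    public static void main(String[] args) {
        // a made-up dataset with a single numeric attribute
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("length"));
        Instances data = new Instances("demo", attributes, 0);

        data.add(new DenseInstance(1.0, new double[] { 42.0 }));
        data.add(new DenseInstance(1.0, new double[] { 7.0 }));

        // numInstances() reports the current row count, here 2
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(data.instance(i));
        }
    }
}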
From source file: de.fub.maps.project.detector.model.inference.processhandler.InferenceDataProcessHandler.java
License: Open Source License
@Override
protected void handle() {
    clearResults();
    Classifier classifier = getInferenceModel().getClassifier();
    HashSet<TrackSegment> inferenceDataSet = getInferenceDataSet();
    Collection<Attribute> attributeList = getInferenceModel().getAttributes();
    if (!attributeList.isEmpty()) {
        Set<String> keySet = getInferenceModel().getInput().getTrainingsSet().keySet();
        setClassesToView(keySet);
        Instances unlabeledInstances = new Instances("Unlabeled Tracks", new ArrayList<Attribute>(attributeList), 0); //NOI18N
        unlabeledInstances.setClassIndex(0);
        ArrayList<TrackSegment> segmentList = new ArrayList<TrackSegment>();
        for (TrackSegment segment : inferenceDataSet) {
            Instance instance = getInstance(segment);
            unlabeledInstances.add(instance);
            segmentList.add(segment);
        }
        // create copy
        Instances labeledInstances = new Instances(unlabeledInstances);
        for (int index = 0; index < labeledInstances.numInstances(); index++) {
            try {
                Instance instance = labeledInstances.instance(index);
                // classify instance
                double classified = classifier.classifyInstance(instance);
                instance.setClassValue(classified);
                // get class label
                String value = unlabeledInstances.classAttribute().value((int) classified);
                if (index < segmentList.size()) {
                    instanceToTrackSegmentMap.put(instance, segmentList.get(index));
                }
                // put label and instance into the result map
                put(value, instance);
            } catch (Exception ex) {
                Exceptions.printStackTrace(ex);
            }
        }
        // update view
        updateVisualRepresentation();
        // update result set of the inference model
        for (Entry<String, List<Instance>> entry : resultMap.entrySet()) {
            HashSet<TrackSegment> trackSegmentList = new HashSet<TrackSegment>();
            for (Instance instance : entry.getValue()) {
                TrackSegment trackSegment = instanceToTrackSegmentMap.get(instance);
                if (trackSegment != null) {
                    trackSegmentList.add(trackSegment);
                }
            }
            // only non-empty classes are put into the result data set
            if (!trackSegmentList.isEmpty()) {
                getInferenceModel().getResult().put(entry.getKey(), trackSegmentList);
            }
        }
    } else {
        throw new InferenceModelClassifyException(MessageFormat
                .format("No attributes available. Attribute list length == {0}", attributeList.size()));
    }
    resultMap.clear();
    instanceToTrackSegmentMap.clear();
}
From source file: de.fub.maps.project.detector.model.inference.processhandler.SpecialInferenceDataProcessHandler.java
License: Open Source License
@Override
protected void handle() {
    clearResults();
    Classifier classifier = getInferenceModel().getClassifier();
    Collection<Attribute> attributeList = getInferenceModel().getAttributes();
    if (!attributeList.isEmpty()) {
        Set<String> keySet = getInferenceModel().getInput().getTrainingsSet().keySet();
        setClassesToView(keySet);
        Instances unlabeledInstances = new Instances("Unlabeled Tracks", new ArrayList<Attribute>(attributeList), 0); //NOI18N
        unlabeledInstances.setClassIndex(0);
        ArrayList<TrackSegment> segmentList = new ArrayList<TrackSegment>();
        for (Entry<String, HashSet<TrackSegment>> entry : getInferenceModel().getInput().getTrainingsSet().entrySet()) {
            for (TrackSegment segment : entry.getValue()) {
                segment.setLabel(entry.getKey());
                Instance instance = getInstance(segment);
                unlabeledInstances.add(instance);
                segmentList.add(segment);
            }
        }
        // create copy
        Instances labeledInstances = new Instances(unlabeledInstances);
        for (int index = 0; index < labeledInstances.numInstances(); index++) {
            try {
                Instance instance = labeledInstances.instance(index);
                // classify instance
                double classified = classifier.classifyInstance(instance);
                instance.setClassValue(classified);
                // get class label
                String value = unlabeledInstances.classAttribute().value((int) classified);
                if (index < segmentList.size()) {
                    instanceToTrackSegmentMap.put(instance, segmentList.get(index));
                }
                // put label and instance into the result map
                put(value, instance);
            } catch (Exception ex) {
                Exceptions.printStackTrace(ex);
            }
        }
        // update view
        updateVisualRepresentation();
        // update result set of the inference model
        for (Map.Entry<String, List<Instance>> entry : resultMap.entrySet()) {
            HashSet<TrackSegment> trackSegmentList = new HashSet<TrackSegment>();
            for (Instance instance : entry.getValue()) {
                TrackSegment trackSegment = instanceToTrackSegmentMap.get(instance);
                if (trackSegment != null) {
                    trackSegmentList.add(trackSegment);
                }
            }
            // only non-empty classes are put into the result data set
            if (!trackSegmentList.isEmpty()) {
                getInferenceModel().getResult().put(entry.getKey(), trackSegmentList);
            }
        }
    } else {
        throw new InferenceModelClassifyException(MessageFormat
                .format("No attributes available. Attribute list length == {0}", attributeList.size()));
    }
    resultMap.clear();
    instanceToTrackSegmentMap.clear();
}
From source file: de.fub.maps.project.detector.model.inference.processhandler.TrainingsDataProcessHandler.java
License: Open Source License
@Override
protected void handle() {
    final ProgressHandle handle = ProgressHandleFactory.createHandle("Training");
    try {
        handle.start();
        Collection<Attribute> attributeCollection = getInferenceModel().getAttributes();
        ArrayList<Attribute> arrayList = new ArrayList<Attribute>(attributeCollection);
        Instances trainingSet = new Instances("Classes", arrayList, 0);
        trainingSet.setClassIndex(0);
        Instances testingSet = new Instances("Classes", arrayList, 0);
        testingSet.setClassIndex(0);
        HashMap<String, HashSet<TrackSegment>> dataset = getInferenceModel().getInput().getTrainingsSet();
        int datasetCount = 0;
        for (HashSet<TrackSegment> list : dataset.values()) {
            for (TrackSegment trackSegment : list) {
                datasetCount += trackSegment.getWayPointList().size();
            }
        }
        handle.switchToDeterminate(datasetCount);
        int trackCount = 0;
        for (Entry<String, HashSet<TrackSegment>> entry : dataset.entrySet()) {
            int trainingsSetSize = (int) Math.ceil(entry.getValue().size() * getTrainingsSetRatioParameter());
            int index = 0;
            for (TrackSegment trackSegment : entry.getValue()) {
                Instance instance = getInstance(entry.getKey(), trackSegment);
                if (index < trainingsSetSize) {
                    trainingSet.add(instance);
                } else {
                    testingSet.add(instance);
                }
                handle.progress(trackCount++);
                index++;
            }
        }
        assert trainingSet.numInstances() > 0 : "Training set is empty and has no instances"; //NOI18N
        assert testingSet.numInstances() > 0 : "Testing set is empty and has no instances"; //NOI18N
        handle.switchToIndeterminate();
        evaluate(trainingSet, testingSet);
    } finally {
        handle.finish();
    }
}
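The handler above builds its own per-class ratio split and then uses numInstances() to assert that neither set is empty. For comparison, a minimal sketch of a plain randomized split over a whole dataset; the 80/20 ratio and the seed are assumptions for illustration, not values from this handler:

data.randomize(new java.util.Random(42)); // assumed seed
int trainSize = (int) Math.ceil(data.numInstances() * 0.8); // assumed 80/20 ratio
Instances train = new Instances(data, 0, trainSize);
Instances test = new Instances(data, trainSize, data.numInstances() - trainSize);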
From source file: de.tudarmstadt.ukp.alignment.framework.combined.WekaMachineLearning.java
License: Apache License
/**
 * Applies a serialized WEKA model file to an unlabeled .arff file for classification.
 *
 * @param input_arff the unlabeled .arff file whose instances are to be classified
 * @param model the file to read the serialized classifier from
 * @param output output file for the labeled data
 * @throws Exception
 */
public static void applyModelToUnlabeledArff(String input_arff, String model, String output)
        throws Exception {
    DataSource source = new DataSource(input_arff);
    Instances unlabeled = source.getDataSet();
    if (unlabeled.classIndex() == -1) {
        unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
    }
    Remove rm = new Remove();
    rm.setAttributeIndices("1"); // configured to remove the ID attribute, but never applied in this method
    ObjectInputStream ois = new ObjectInputStream(new FileInputStream(model));
    Classifier cls = (Classifier) ois.readObject();
    ois.close();
    // create copy
    Instances labeled = new Instances(unlabeled);
    // label instances
    for (int i = 0; i < unlabeled.numInstances(); i++) {
        double clsLabel = cls.classifyInstance(unlabeled.instance(i));
        labeled.instance(i).setClassValue(clsLabel);
    }
    // save labeled data
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    writer.write(labeled.toString());
    writer.newLine();
    writer.flush();
    writer.close();
}
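A call to this helper might look as follows; the file names are placeholders for illustration, not files from the original project:

applyModelToUnlabeledArff("unlabeled.arff", "classifier.model", "labeled.arff");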
From source file: de.tudarmstadt.ukp.similarity.experiments.coling2012.util.Evaluator.java
License: Open Source License
public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception {
    // Set parameters
    int folds = 10;
    Classifier baseClassifier = getClassifier(wekaClassifier);

    // Set up the random number generator
    long seed = new Date().getTime();
    Random random = new Random(seed);

    // Add IDs to the instances
    AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o",
            MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" });
    Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Instantiate the Remove filter
    Remove removeIDFilter = new Remove();
    removeIDFilter.setAttributeIndices("first");

    // Randomize the data
    data.randomize(random);

    // Perform cross-validation
    Instances predictedData = null;
    Evaluation eval = new Evaluation(data);

    for (int n = 0; n < folds; n++) {
        Instances train = data.trainCV(folds, n, random);
        Instances test = data.testCV(folds, n);

        // Apply log filter
        // Filter logFilter = new LogFilter();
        // logFilter.setInputFormat(train);
        // train = Filter.useFilter(train, logFilter);
        // logFilter.setInputFormat(test);
        // test = Filter.useFilter(test, logFilter);

        // Copy the classifier
        Classifier classifier = AbstractClassifier.makeCopy(baseClassifier);

        // Instantiate the FilteredClassifier
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(removeIDFilter);
        filteredClassifier.setClassifier(classifier);

        // Build the classifier
        filteredClassifier.buildClassifier(train);

        // Evaluate
        eval.evaluateModel(filteredClassifier, test);

        // Add predictions
        AddClassification filter = new AddClassification();
        filter.setClassifier(filteredClassifier);
        filter.setOutputClassification(true);
        filter.setOutputDistribution(false);
        filter.setOutputErrorFlag(true);
        filter.setInputFormat(train);
        Filter.useFilter(train, filter); // trains the classifier
        Instances pred = Filter.useFilter(test, filter); // performs predictions on test set
        if (predictedData == null) {
            predictedData = new Instances(pred, 0);
        }
        for (int j = 0; j < pred.numInstances(); j++) {
            predictedData.add(pred.instance(j));
        }
    }

    // Prepare output classification
    String[] scores = new String[predictedData.numInstances()];
    for (Instance predInst : predictedData) {
        int id = new Double(predInst.value(predInst.attribute(0))).intValue() - 1;
        int valueIdx = predictedData.numAttributes() - 2;
        String value = predInst.stringValue(predInst.attribute(valueIdx));
        scores[id] = value;
    }

    // Output
    StringBuilder sb = new StringBuilder();
    for (String score : scores) {
        sb.append(score + LF);
    }
    FileUtils.writeStringToFile(
            new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/output.csv"),
            sb.toString());
}
From source file: de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License: Apache License
/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {
    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine the cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of the cluster numbers
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();

    // now we filter the data;
    // this is first tried with the metrics with the fewest violations. If no buggy/bug-free
    // instances remain, this is repeated with the metrics with the second fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    int violationCutoff;
    do {
        violationCutoff = violationCountIterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected metrics and median thresholds: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.CLAProcessor.java
License: Apache License
/**
 * Applies the CLA processor to the data.
 *
 * @param data
 *            data to which the processor is applied
 */
public void applyCLA(Instances data) {
    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine the cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of the distinct cluster numbers
    Median m = new Median();
    double medianClusterNumber = m.evaluate(Arrays.stream(clusterNumber).distinct().toArray());

    // finally modify the instances:
    // label instances above the median cluster number as buggy, the rest as bug-free
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (clusterNumber[i] > medianClusterNumber) {
            data.get(i).setClassValue(1.0d);
        } else {
            data.get(i).setClassValue(0.0d);
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Applies MORPH to the data.
 * </p>
 *
 * @param data
 *            data to which the processor is applied
 */
public void applyMORPH(Instances data) {
    for (int i = 0; i < data.numInstances(); i++) {
        morphInstance(data.get(i), data);
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance.
 * </p>
 *
 * @param instance
 *            instance for which the nearest unlike neighbor is determined
 * @param data
 *            data from which the nearest unlike neighbor is determined
 * @return nearest unlike instance
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = null;
    double[] instanceVector = new double[data.numAttributes() - 1];
    int tmp = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            instanceVector[tmp++] = instance.value(j);
        }
    }
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        if (instance.classValue() != data.instance(i).classValue()) {
            double[] otherVector = new double[data.numAttributes() - 1];
            tmp = 0;
            for (int j = 0; j < data.numAttributes(); j++) {
                if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
                    otherVector[tmp++] = data.instance(i).value(j);
                }
            }
            double distance = MathArrays.distance(instanceVector, otherVector);
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = data.instance(i);
            }
        }
    }
    return nearestUnlikeNeighbor;
}
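getNearestUnlikeNeighbor scans all data.numInstances() rows for the closest instance with a different class value. The project's morphInstance method is not shown on this page; as a rough sketch under stated assumptions, a MORPH-style perturbation moves an instance relative to its nearest unlike neighbor by a random factor, commonly drawn from [0.15, 0.35] (the interval used here is an assumption):

// hypothetical sketch of a MORPH-style perturbation, not the project's morphInstance
Instance nun = getNearestUnlikeNeighbor(instance, data);
java.util.Random rand = new java.util.Random();
for (int j = 0; j < data.numAttributes(); j++) {
    if (j != data.classIndex() && data.attribute(j).isNumeric()) {
        double r = 0.15 + 0.2 * rand.nextDouble(); // assumed range [0.15, 0.35]
        instance.setValue(j, instance.value(j) + r * (instance.value(j) - nun.value(j)));
    }
}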
From source file: de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    int indexOfConfidenceAttribute = -1;

    // Find the index of the named confidence attribute to filter for
    for (int i = 0; i < traindata.numAttributes(); i++) {
        if (traindata.attribute(i).name().equals(nominalAttributeName)) {
            indexOfConfidenceAttribute = i;
        }
    }

    // if it was not found, return
    if (indexOfConfidenceAttribute == -1) {
        return;
    }

    // Find the indices of the nominal values
    Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute);
    ArrayList<Object> nominalValuesOfConfidenceAttribute = Collections
            .list(confidenceAttribute.enumerateValues());
    ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>();

    for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) {
        for (String attributeValue : nominalAttributeValues) {
            if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) {
                indexOfnominalAttributeValues.add((double) k);
            }
        }
    }

    // Go through all instances and check whether the nominal attribute matches
    for (int j = traindata.numInstances() - 1; j >= 0; j--) {
        Instance wekaInstance = traindata.get(j);
        // delete all instances where the nominal attribute has one of the parameter values
        if (indexOfnominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) {
            traindata.delete(j);
        }
    }
}