List of usage examples for weka.core.Instance value
public double value(Attribute att);
public double value(int attIndex);
From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License:Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance.
 * </p>
 *
 * @param instance
 *            instance to which the nearest unlike neighbor is determined
 * @param data
 *            data where the nearest unlike neighbor is determined from
 * @return nearest unlike instance
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = null;

    // copy the numeric non-class attribute values of the instance into a vector
    double[] instanceVector = new double[data.numAttributes() - 1];
    int tmp = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            instanceVector[tmp++] = instance.value(j);
        }
    }

    // find the instance with a different class value that minimizes the Euclidean distance
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        if (instance.classValue() != data.instance(i).classValue()) {
            double[] otherVector = new double[data.numAttributes() - 1];
            tmp = 0;
            for (int j = 0; j < data.numAttributes(); j++) {
                if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
                    otherVector[tmp++] = data.instance(i).value(j);
                }
            }
            double distance = MathArrays.distance(instanceVector, otherVector);
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = data.instance(i);
            }
        }
    }
    return nearestUnlikeNeighbor;
}
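For orientation, a minimal sketch of how this method might be called. The ARFF path, the class-index setup, and the no-argument MORPH constructor are illustrative assumptions, not taken from the original source:

import de.ugoe.cs.cpdp.dataprocessing.MORPH;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MorphExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);  // assumes the class is the last attribute

        MORPH morph = new MORPH(); // assumes a no-argument constructor
        Instance nun = morph.getNearestUnlikeNeighbor(data.instance(0), data);
        System.out.println("Nearest unlike neighbor: " + nun); // null if no unlike neighbor exists
    }
}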
From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    int indexOfConfidenceAttribute = -1;

    // Find the index of the named confidence attribute to filter for
    for (int i = 0; i < traindata.numAttributes(); i++) {
        if (traindata.attribute(i).name().equals(nominalAttributeName)) {
            indexOfConfidenceAttribute = i;
        }
    }

    // If it was not found, return
    if (indexOfConfidenceAttribute == -1) {
        return;
    }

    // Find the indices of the nominal values
    Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute);
    ArrayList<Object> nominalValuesOfConfidenceAttribute =
        Collections.list(confidenceAttribute.enumerateValues());
    ArrayList<Double> indexOfNominalAttributeValues = new ArrayList<Double>();

    for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) {
        for (String attributeValue : nominalAttributeValues) {
            if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) {
                indexOfNominalAttributeValues.add((double) k);
            }
        }
    }

    // Go through all instances and delete every instance where the nominal attribute
    // has one of the configured values
    for (int j = traindata.numInstances() - 1; j >= 0; j--) {
        Instance wekaInstance = traindata.get(j);
        if (indexOfNominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) {
            traindata.delete(j);
        }
    }
}
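The deletion pattern above can also be reproduced with plain Weka calls. A sketch, assuming a hypothetical nominal attribute named "confidence" whose value "low" should be filtered out; iterating backwards keeps the indices of the not-yet-visited instances stable during deletion:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NominalFilterExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        int attIndex = data.attribute("confidence").index(); // hypothetical attribute
        double unwanted = data.attribute(attIndex).indexOfValue("low"); // hypothetical value

        for (int i = data.numInstances() - 1; i >= 0; i--) {
            if (data.instance(i).value(attIndex) == unwanted) {
                data.delete(i);
            }
        }
        System.out.println("Remaining instances: " + data.numInstances());
    }
}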
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Min-max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
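A minimal usage sketch; the file name is a placeholder. Note that a constant attribute (max == min) makes the division above yield NaN, so such attributes may need special handling before calling this method:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MinMaxExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        NormalizationUtil.minMax(data);

        // all non-class attributes are now scaled to [0, 1]
        System.out.println(data.instance(0));
    }
}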
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Internal helper function that applies a z-score transformation with the given means and
 * standard deviations to all non-class attributes.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
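Only the private helper is shown above. A sketch of a public wrapper, assumed to live in the same NormalizationUtil class, that computes per-attribute means and standard deviations via Weka's attribute statistics and then delegates; it assumes all non-class attributes are numeric:

// assumed wrapper; not part of the original listing
public static void zScore(Instances data) {
    double[] mean = new double[data.numAttributes()];
    double[] std = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            mean[j] = data.attributeStats(j).numericStats.mean;
            std[j] = data.attributeStats(j).numericStats.stdDev;
        }
    }
    applyZScore(data, mean, std);
}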
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurrenceArtifactNames = new ArrayList<Double>();

    // Sort the dataset (the StateID is connected to the date of the commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is in the hashmap already
     * (this means: it did not have a bug before). If so, add it to the new dataset and remove
     * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be
     * found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at
     * its first occurrence, or this file only has bugs and no instance without a bug), then
     * (if it is not in the array list above) add it to the new dataset. This way it is
     * possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 1.0 && artifactNames.containsKey(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        } else if (newBugLabel == 1.0 && !artifactNames.containsKey(artifactName)) {
            if (!firstOccurrenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurrenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesInNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesInNewDataSet.length; i++) {
            if (artifactNamesInNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
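A sketch of how this filter might be driven. It assumes ARFF files that contain the Artifact.Target.StateID and Artifact.Name attributes referenced above, a 0/1 bug label as class attribute, and a no-argument SimulationFilter constructor:

import de.ugoe.cs.cpdp.dataprocessing.SimulationFilter;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SimulationFilterExample {
    public static void main(String[] args) throws Exception {
        Instances traindata = DataSource.read("train.arff"); // placeholder path
        Instances testdata = DataSource.read("test.arff");   // placeholder path
        traindata.setClassIndex(traindata.numAttributes() - 1);
        testdata.setClassIndex(testdata.numAttributes() - 1);

        new SimulationFilter().apply(testdata, traindata);
        System.out.println("Instances after filtering: " + traindata.numInstances());
    }
}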
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/**
 * <p>
 * Calculates the linear kernel function between two instances, i.e., the dot product over all
 * non-class attributes.
 * </p>
 *
 * @param x1
 *            first instance
 * @param x2
 *            second instance
 * @return kernel value
 */
private double linearKernel(Instance x1, Instance x2) {
    double value = 0.0d;
    for (int j = 0; j < x1.numAttributes(); j++) {
        if (j != x1.classIndex()) {
            value += x1.value(j) * x2.value(j);
        }
    }
    return value;
}
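As a sanity check of the computation: for non-class values (1, 2) and (3, 4), the kernel is 1*3 + 2*4 = 11. A self-contained sketch with hypothetical attributes:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class LinearKernelExample {
    public static void main(String[] args) {
        // two numeric attributes plus a class attribute
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("a"));
        atts.add(new Attribute("b"));
        atts.add(new Attribute("class"));
        Instances data = new Instances("example", atts, 2);
        data.setClassIndex(2);

        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0, 0.0 }));
        data.add(new DenseInstance(1.0, new double[] { 3.0, 4.0, 1.0 }));

        // dot product over the non-class attributes: 1*3 + 2*4 = 11.0
        Instance x1 = data.instance(0);
        Instance x2 = data.instance(1);
        double value = 0.0;
        for (int j = 0; j < x1.numAttributes(); j++) {
            if (j != x1.classIndex()) {
                value += x1.value(j) * x2.value(j);
            }
        }
        System.out.println(value); // prints 11.0
    }
}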
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Adaptation of the Hamming distance to numerical values, i.e., basically a count of differing
 * metric values.
 * </p>
 *
 * @param inst1
 *            first instance to be compared
 * @param inst2
 *            second instance to be compared
 * @return the distance
 */
public static double hammingDistance(Instance inst1, Instance inst2) {
    double distance = 0.0;
    for (int j = 0; j < inst1.numAttributes(); j++) {
        if (j != inst1.classIndex()) {
            if (inst1.value(j) != inst2.value(j)) {
                distance += 1.0;
            }
        }
    }
    return distance;
}
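A brief usage sketch with a placeholder file name. Since the comparison above uses exact floating-point inequality, only exactly identical values count as equal:

import de.ugoe.cs.cpdp.util.WekaUtils;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class HammingExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // number of non-class attributes on which the two instances differ
        double d = WekaUtils.hammingDistance(data.instance(0), data.instance(1));
        System.out.println("Differing metric values: " + d);
    }
}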
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Returns a double array of the values without the classification.
 * </p>
 *
 * @param instance
 *            the instance
 * @return double array
 */
public static double[] instanceValues(Instance instance) {
    double[] values = new double[instance.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < instance.numAttributes(); j++) {
        if (j != instance.classIndex()) {
            values[k] = instance.value(j);
            k++;
        }
    }
    return values;
}
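A brief usage sketch with a placeholder file name:

import java.util.Arrays;

import de.ugoe.cs.cpdp.util.WekaUtils;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class InstanceValuesExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // all attribute values of the first instance, without the class value
        double[] values = WekaUtils.instanceValues(data.instance(0));
        System.out.println(Arrays.toString(values));
    }
}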
From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License:Open Source License
// mean absolute difference over all attributes (including the class attribute)
private double distance(Instance i1, Instance i2) {
    double dist = 0.0;
    for (int i = 0; i < i1.numAttributes(); i++) {
        dist += Math.abs(i1.value(i) - i2.value(i));
    }
    return dist / i1.numAttributes();
}
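Note that, unlike the helpers above, this distance averages over all attributes and does not skip the class index. A self-contained worked sketch with hypothetical attributes, where (|1-4| + |2-6|) / 2 = 3.5:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class MeanAbsDistanceExample {
    public static void main(String[] args) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("a"));
        atts.add(new Attribute("b"));
        Instances data = new Instances("example", atts, 2);

        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0 }));
        data.add(new DenseInstance(1.0, new double[] { 4.0, 6.0 }));

        // mean absolute difference: (|1-4| + |2-6|) / 2 = 3.5
        double dist = 0.0;
        for (int i = 0; i < data.numAttributes(); i++) {
            dist += Math.abs(data.instance(0).value(i) - data.instance(1).value(i));
        }
        System.out.println(dist / data.numAttributes()); // prints 3.5
    }
}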
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get the number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate the data for the clusterer (without the class attribute)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from cluster IDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {
            // get the outcome label and the instance ID from the instance
            Instance instance = copyTrainData.get(offset);
            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(clusterId + "\t" + clusterMap.get(clusterId).size() + "\t"
                + purityString + "\t" + rmseString);
    }
    System.out.println();
}