List of usage examples for weka.core.Instance value
public double value(Attribute att);
public double value(int attIndex);
From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License:Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance.
 * </p>
 *
 * @param instance
 *            instance to which the nearest unlike neighbor is determined
 * @param data
 *            data where the nearest unlike neighbor is determined from
 * @return nearest unlike instance
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = null;

    // copy the numeric non-class attribute values of the instance into a vector
    double[] instanceVector = new double[data.numAttributes() - 1];
    int tmp = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            instanceVector[tmp++] = instance.value(j);
        }
    }

    // find the instance with a different class value that minimizes the Euclidean distance
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        if (instance.classValue() != data.instance(i).classValue()) {
            double[] otherVector = new double[data.numAttributes() - 1];
            tmp = 0;
            for (int j = 0; j < data.numAttributes(); j++) {
                if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
                    otherVector[tmp++] = data.instance(i).value(j);
                }
            }
            double distance = MathArrays.distance(instanceVector, otherVector);
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = data.instance(i);
            }
        }
    }
    return nearestUnlikeNeighbor;
}
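For orientation, a minimal sketch of how this method might be called. The ARFF path, the class-index setup, and the no-argument MORPH constructor are illustrative assumptions, not taken from the original source:

import de.ugoe.cs.cpdp.dataprocessing.MORPH;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MorphExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);  // assumes the class is the last attribute

        MORPH morph = new MORPH(); // assumes a no-argument constructor
        Instance nun = morph.getNearestUnlikeNeighbor(data.instance(0), data);
        System.out.println("Nearest unlike neighbor: " + nun); // null if no unlike neighbor exists
    }
}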
From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    int indexOfConfidenceAttribute = -1;

    // Find the index of the named confidence attribute to filter for
    for (int i = 0; i < traindata.numAttributes(); i++) {
        if (traindata.attribute(i).name().equals(nominalAttributeName)) {
            indexOfConfidenceAttribute = i;
        }
    }

    // If it was not found, return
    if (indexOfConfidenceAttribute == -1) {
        return;
    }

    // Find the indices of the nominal values
    Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute);
    ArrayList<Object> nominalValuesOfConfidenceAttribute =
        Collections.list(confidenceAttribute.enumerateValues());
    ArrayList<Double> indexOfNominalAttributeValues = new ArrayList<Double>();

    for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) {
        for (String attributeValue : nominalAttributeValues) {
            if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) {
                indexOfNominalAttributeValues.add((double) k);
            }
        }
    }

    // Go through all instances and delete every instance where the nominal attribute
    // has one of the configured values
    for (int j = traindata.numInstances() - 1; j >= 0; j--) {
        Instance wekaInstance = traindata.get(j);
        if (indexOfNominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) {
            traindata.delete(j);
        }
    }
}
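The deletion pattern above can also be reproduced with plain Weka calls. A sketch, assuming a hypothetical nominal attribute named "confidence" whose value "low" should be filtered out; iterating backwards keeps the indices of the not-yet-visited instances stable during deletion:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NominalFilterExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        int attIndex = data.attribute("confidence").index(); // hypothetical attribute
        double unwanted = data.attribute(attIndex).indexOfValue("low"); // hypothetical value

        for (int i = data.numInstances() - 1; i >= 0; i--) {
            if (data.instance(i).value(attIndex) == unwanted) {
                data.delete(i);
            }
        }
        System.out.println("Remaining instances: " + data.numInstances());
    }
}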
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Min-max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
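A minimal usage sketch; the file name is a placeholder. Note that a constant attribute (max == min) makes the division above yield NaN, so such attributes may need special handling before calling this method:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MinMaxExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        NormalizationUtil.minMax(data);

        // all non-class attributes are now scaled to [0, 1]
        System.out.println(data.instance(0));
    }
}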
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Internal helper function that applies a z-score transformation with the given means and
 * standard deviations to all non-class attributes.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
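Only the private helper is shown above. A sketch of a public wrapper, assumed to live in the same NormalizationUtil class, that computes per-attribute means and standard deviations via Weka's attribute statistics and then delegates; it assumes all non-class attributes are numeric:

// assumed wrapper; not part of the original listing
public static void zScore(Instances data) {
    double[] mean = new double[data.numAttributes()];
    double[] std = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            mean[j] = data.attributeStats(j).numericStats.mean;
            std[j] = data.attributeStats(j).numericStats.stdDev;
        }
    }
    applyZScore(data, mean, std);
}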
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurrenceArtifactNames = new ArrayList<Double>();

    // Sort the dataset (the StateID is connected to the date of the commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is in the hashmap already
     * (this means: it did not have a bug before). If so, add it to the new dataset and remove
     * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be
     * found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at
     * its first occurrence, or this file only has bugs and no instance without a bug), then
     * (if it is not in the array list above) add it to the new dataset. This way it is
     * possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 1.0 && artifactNames.containsKey(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        } else if (newBugLabel == 1.0 && !artifactNames.containsKey(artifactName)) {
            if (!firstOccurrenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurrenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesInNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesInNewDataSet.length; i++) {
            if (artifactNamesInNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
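A sketch of how this filter might be driven. It assumes ARFF files that contain the Artifact.Target.StateID and Artifact.Name attributes referenced above, a 0/1 bug label as class attribute, and a no-argument SimulationFilter constructor:

import de.ugoe.cs.cpdp.dataprocessing.SimulationFilter;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SimulationFilterExample {
    public static void main(String[] args) throws Exception {
        Instances traindata = DataSource.read("train.arff"); // placeholder path
        Instances testdata = DataSource.read("test.arff");   // placeholder path
        traindata.setClassIndex(traindata.numAttributes() - 1);
        testdata.setClassIndex(testdata.numAttributes() - 1);

        new SimulationFilter().apply(testdata, traindata);
        System.out.println("Instances after filtering: " + traindata.numInstances());
    }
}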
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/**
 * <p>
 * Calculates the linear kernel function between two instances, i.e., the dot product over all
 * non-class attributes.
 * </p>
 *
 * @param x1
 *            first instance
 * @param x2
 *            second instance
 * @return kernel value
 */
private double linearKernel(Instance x1, Instance x2) {
    double value = 0.0d;
    for (int j = 0; j < x1.numAttributes(); j++) {
        if (j != x1.classIndex()) {
            value += x1.value(j) * x2.value(j);
        }
    }
    return value;
}
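As a sanity check of the computation: for non-class values (1, 2) and (3, 4), the kernel is 1*3 + 2*4 = 11. A self-contained sketch with hypothetical attributes:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class LinearKernelExample {
    public static void main(String[] args) {
        // two numeric attributes plus a class attribute
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("a"));
        atts.add(new Attribute("b"));
        atts.add(new Attribute("class"));
        Instances data = new Instances("example", atts, 2);
        data.setClassIndex(2);

        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0, 0.0 }));
        data.add(new DenseInstance(1.0, new double[] { 3.0, 4.0, 1.0 }));

        // dot product over the non-class attributes: 1*3 + 2*4 = 11.0
        Instance x1 = data.instance(0);
        Instance x2 = data.instance(1);
        double value = 0.0;
        for (int j = 0; j < x1.numAttributes(); j++) {
            if (j != x1.classIndex()) {
                value += x1.value(j) * x2.value(j);
            }
        }
        System.out.println(value); // prints 11.0
    }
}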
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Adaptation of the Hamming distance to numerical values, i.e., basically a count of differing
 * metric values.
 * </p>
 *
 * @param inst1
 *            first instance to be compared
 * @param inst2
 *            second instance to be compared
 * @return the distance
 */
public static double hammingDistance(Instance inst1, Instance inst2) {
    double distance = 0.0;
    for (int j = 0; j < inst1.numAttributes(); j++) {
        if (j != inst1.classIndex()) {
            if (inst1.value(j) != inst2.value(j)) {
                distance += 1.0;
            }
        }
    }
    return distance;
}
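A brief usage sketch with a placeholder file name. Since the comparison above uses exact floating-point inequality, only exactly identical values count as equal:

import de.ugoe.cs.cpdp.util.WekaUtils;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class HammingExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // number of non-class attributes on which the two instances differ
        double d = WekaUtils.hammingDistance(data.instance(0), data.instance(1));
        System.out.println("Differing metric values: " + d);
    }
}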
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Returns a double array of the values without the classification.
 * </p>
 *
 * @param instance
 *            the instance
 * @return double array
 */
public static double[] instanceValues(Instance instance) {
    double[] values = new double[instance.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < instance.numAttributes(); j++) {
        if (j != instance.classIndex()) {
            values[k] = instance.value(j);
            k++;
        }
    }
    return values;
}
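A brief usage sketch with a placeholder file name:

import java.util.Arrays;

import de.ugoe.cs.cpdp.util.WekaUtils;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class InstanceValuesExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // all attribute values of the first instance, without the class value
        double[] values = WekaUtils.instanceValues(data.instance(0));
        System.out.println(Arrays.toString(values));
    }
}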
From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License:Open Source License
// mean absolute difference over all attributes (including the class attribute)
private double distance(Instance i1, Instance i2) {
    double dist = 0.0;
    for (int i = 0; i < i1.numAttributes(); i++) {
        dist += Math.abs(i1.value(i) - i2.value(i));
    }
    return dist / i1.numAttributes();
}
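Note that, unlike the helpers above, this distance averages over all attributes and does not skip the class index. A self-contained worked sketch with hypothetical attributes, where (|1-4| + |2-6|) / 2 = 3.5:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class MeanAbsDistanceExample {
    public static void main(String[] args) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("a"));
        atts.add(new Attribute("b"));
        Instances data = new Instances("example", atts, 2);

        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0 }));
        data.add(new DenseInstance(1.0, new double[] { 4.0, 6.0 }));

        // mean absolute difference: (|1-4| + |2-6|) / 2 = 3.5
        double dist = 0.0;
        for (int i = 0; i < data.numAttributes(); i++) {
            dist += Math.abs(data.instance(0).value(i) - data.instance(1).value(i));
        }
        System.out.println(dist / data.numAttributes()); // prints 3.5
    }
}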
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get the number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate the data for the clusterer (without the class attribute)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from cluster IDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {
            // get the outcome label and the instance ID from the instance
            Instance instance = copyTrainData.get(offset);
            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(clusterId + "\t" + clusterMap.get(clusterId).size() + "\t"
                + purityString + "\t" + rmseString);
    }
    System.out.println();
}