Example usage for weka.core Instances classIndex

List of usage examples for weka.core Instances classIndex

Introduction

On this page you can find example usage for weka.core Instances classIndex.

Prototype


public int classIndex()

Document

Returns the class attribute's index.
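
If no class attribute has been set, the method returns a negative value (-1). A minimal sketch of the usual check-and-set idiom; the file name train.arff is a placeholder:

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class ClassIndexExample {
    public static void main(String[] args) throws Exception {
        Instances data = new Instances(new BufferedReader(new FileReader("train.arff")));
        // ARFF files do not mark the class attribute, so classIndex() is -1 after loading
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1); // common convention: class is last
        }
        System.out.println("Class attribute: " + data.classAttribute().name());
    }
}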

Usage

From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java

License: Apache License

/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // relative frequency of the defect-prone class (second value of the nominal class attribute)
    final double probDefect = counts[1] / (double) data.numInstances();

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // skip the class attribute; its column in powerAttributes is never filled
            if (j != data.classIndex()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
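
In this example, classIndex() feeds attributeStats to obtain the distribution of the (nominal) class. That pattern in isolation, as a sketch:

// counts[k] is the number of instances whose class equals the k-th nominal value
int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
double relativeFrequency = counts[1] / (double) data.numInstances();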

From source file: de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java

License: Apache License

/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();
    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }
    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if the cluster contains any test data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }

        }
    }

    return filteredTraindata;
}
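
Both copy loops above flatten the instances into a raw double matrix while skipping the class column. A hedged helper capturing that pattern (the name toFeatureMatrix is ours):

static double[][] toFeatureMatrix(weka.core.Instances insts) {
    double[][] matrix = new double[insts.size()][insts.numAttributes() - 1];
    int classIndex = insts.classIndex();
    for (int i = 0; i < insts.size(); i++) {
        int k = 0;
        for (int j = 0; j < insts.numAttributes(); j++) {
            if (j != classIndex) {
                matrix[i][k++] = insts.get(i).value(j); // copy all non-class values
            }
        }
    }
    return matrix;
}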

From source file: de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java

License: Apache License

/**
 * <p>
 * Removes all instances whose Mahalanobis distance to the mean of the data is greater than
 * epsilon.
 * </p>
 *
 * @param data
 *            data where the outliers are removed
 */
private void applyMahalanobisDistancesRemoval(Instances data) {
    RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
    for (int i = 0; i < data.size(); i++) {
        values.setRow(i, WekaUtils.instanceValues(data.get(i)));
    }
    RealMatrix inverseCovariance;
    try {
        inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver()
                .getInverse();
    } catch (SingularMatrixException e) {
        Console.traceln(Level.WARNING,
                "could not perform Mahalanobis outlier removal due to singular covariance matrix");
        return;
    }
    // create mean vector
    double[] meanValues = new double[data.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            meanValues[k] = data.attributeStats(j).numericStats.mean;
            k++;
        }
    }

    for (int i = data.size() - 1; i >= 0; i--) {
        double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
                meanValues);
        if (distance > epsilon) {
            data.remove(i);
        }
    }
}
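
The helper mahalanobisDistance is not part of this listing; a minimal sketch using Apache Commons Math, with the signature assumed from the call site:

import org.apache.commons.math3.linear.RealMatrix;

static double mahalanobisDistance(RealMatrix inverseCovariance, double[] values, double[] mean) {
    double[] diff = new double[values.length];
    for (int i = 0; i < values.length; i++) {
        diff[i] = values[i] - mean[i];
    }
    double[] tmp = inverseCovariance.operate(diff); // S^-1 * (x - mean)
    double sum = 0.0;
    for (int i = 0; i < diff.length; i++) {
        sum += diff[i] * tmp[i]; // (x - mean)^T * S^-1 * (x - mean)
    }
    return Math.sqrt(sum);
}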

From source file: de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java

License: Apache License

/**
 * <p>
 * Applies the synonym outlier removal.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    double[][] minDistance = new double[traindata.size()][traindata.numAttributes() - 1];
    double[] minDistanceAttribute = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        for (int j = 0; !hasClosest && j < minDistanceAttribute.length; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}

From source file: de.ugoe.cs.cpdp.loader.DecentDataLoader.java

License: Apache License

/**
 * Loads the given decent file and transforms it from decent->arffx->arff.
 *
 * @return Instances in WEKA format
 */
@Override
public Instances load(File file) {

    // Set attributeFilter
    setAttributeFilter();

    // Register MetaModels
    try {
        registerMetaModels();
    } catch (Exception e1) {
        Console.printerrln("Metamodels cannot be registered!");
        e1.printStackTrace();
    }

    // Set location of decent and arffx Model
    String decentModelLocation = file.getAbsolutePath();
    String pathToDecentModelFolder = decentModelLocation.substring(0,
            decentModelLocation.lastIndexOf(File.separator));
    String arffxModelLocation = pathToDecentModelFolder + "/model.arffx";
    String logModelLocation = pathToDecentModelFolder + "/model.log";
    String arffLocation = pathToDecentModelFolder + "/model.arff";

    // If arff File exists, load from it!
    if (new File(arffLocation).exists()) {
        System.out.println("Loading arff File...");
        BufferedReader reader;
        Instances data = null;
        try {
            reader = new BufferedReader(new FileReader(arffLocation));
            data = new Instances(reader);
            reader.close();
        } catch (FileNotFoundException e) {
            Console.printerrln("File with path: " + arffLocation + " was not found.");
            throw new RuntimeException(e);
        } catch (IOException e) {
            Console.printerrln("File with path: " + arffLocation + " cannot be read.");
            throw new RuntimeException(e);
        }

        // Set class attribute if not set
        if (data.classIndex() == -1) {
            Attribute classAttribute = data.attribute(classAttributeName);
            data.setClass(classAttribute);
        }

        return data;
    }

    // Location of EOL Scripts
    String preprocess = "./decent/epsilon/query/preprocess.eol";
    String arffxToArffSource = "./decent/epsilon/query/addLabels.eol";

    // Set Log Properties
    System.setProperty("epsilon.logLevel", logLevel);
    System.setProperty("epsilon.logToFile", logToFile);
    System.setProperty("epsilon.logFileAvailable", "false");

    // Set decent2arffx Properties
    System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false");
    System.setProperty("epsilon.transformation.decent2arffx.type", "code");

    // Preprocess Data, transform from decent2arffx
    try {
        IEolExecutableModule preProcessModule = loadModule(preprocess);
        IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true);
        IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true);
        preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel);
        preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel);
        execute(preProcessModule, logModelLocation);
        preProcessDecentModel.dispose();
        preProcessArffxarffxModel.dispose();
        preProcessModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for decent or arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Transform to arff, for label and confidence attributes
    try {
        IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource);
        IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true);
        arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel);
        execute(arffxToArffModule, logModelLocation);
        arffxToArffArffxModel.dispose();
        // can be stored and retained alternatively
        arffxToArffModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Unregister MetaModels, otherwise cast will fail
    HashMap<String, Object> metaModelCache = new HashMap<>();
    for (String key : EPackage.Registry.INSTANCE.keySet()) {
        metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key));
    }

    for (String key : metaModelCache.keySet()) {
        EPackage.Registry.INSTANCE.remove(key);
    }

    // Workaround to generate a usable URI. An absolute path is not
    // possible, therefore we need to construct a relative path

    URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation();
    String basePath = location.getFile();

    // Location is the bin folder, so we need to delete the last 4 characters
    basePath = basePath.substring(0, basePath.length() - 4);
    String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath();

    // Load arffx file and create WEKA Instances
    ARFFxResourceTool tool = new ARFFxResourceTool();
    Resource resource = tool.loadResourceFromXMI(relativePath, "arffx");

    Instances dataSet = null;
    for (EObject o : resource.getContents()) {
        Model m = (Model) o;
        dataSet = createWekaDataFormat(m);

        for (Instance i : m.getData()) {
            createWekaInstance(dataSet, i);
        }
    }

    // Set class attribute
    Attribute classAttribute = dataSet.attribute(classAttributeName);
    dataSet.setClass(classAttribute);

    // Save as ARFF
    save(dataSet, arffLocation);

    return dataSet;

}
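
The ARFF branch above loads with a plain reader and then repairs the class index by attribute name. WEKA's converter utilities offer a more compact equivalent; the file name and attribute name below are placeholders for arffLocation and classAttributeName:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

Instances data = DataSource.read("model.arff");
if (data.classIndex() == -1) {
    data.setClass(data.attribute("CLASS"));
}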

From source file: de.ugoe.cs.cpdp.util.WekaUtils.java

License: Apache License

/**
 * <p>
 * Calculates the distributional characteristics of the distances the instances within a data
 * set have to each other.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics
 */
public static DistChar datasetDistance(Instances data) {
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    double max = Double.MIN_VALUE;
    int numCmp = 0;
    int l = 0;
    double[] inst1 = new double[data.numAttributes() - 1];
    double[] inst2 = new double[data.numAttributes() - 1];
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < data.numInstances(); i++) {
        l = 0;
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != data.classIndex()) {
                inst1[l] = data.instance(i).value(k);
                l++;
            }
        }
        for (int j = 0; j < data.numInstances(); j++) {
            if (j != i) {
                l = 0;
                for (int k = 0; k < data.numAttributes(); k++) {
                    if (k != data.classIndex()) {
                        inst2[l] = data.instance(j).value(k);
                        l++;
                    }
                }
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, data.numInstances());
}

From source file: de.unidue.langtech.grading.tc.ClusterExemplarTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}
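
Note that Remove.setAttributeIndices expects 1-based index ranges, which is why the code passes classIndex() + 1. The pattern in isolation, as a sketch for data whose class attribute is already set:

import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

Remove filter = new Remove();
filter.setAttributeIndices("" + (data.classIndex() + 1)); // Remove uses 1-based indices
filter.setInputFormat(data);
Instances withoutClass = Filter.useFilter(data, filter);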

From source file: de.unidue.langtech.grading.tc.ClusteringTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}

From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        //           double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // keep taking clusters until each class has been seen at least once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //           // do not use clusters of single responses, as they always have purity of 1
        //           if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //              continue;
        //           }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}
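
The relabeling step above writes a nominal value through the class index. In isolation (a sketch; "someLabel" must be one of the class attribute's declared nominal values):

for (weka.core.Instance inst : copyTrainData) {
    inst.setValue(copyTrainData.classIndex(), "someLabel"); // set nominal class by string
}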

From source file: de.upb.timok.oneclassclassifier.WekaSvmClassifier.java

License: Open Source License

@Override
public void train(List<double[]> trainingSamples) {
    Instances data = DatasetTransformationUtils.trainingSetToInstances(trainingSamples);
    // set the class attribute if the data format does not provide this information
    // (the XRFF format, for example, already saves the class attribute)
    try {
        if (filter != null) {
            filter.setInputFormat(data);
            data = Filter.useFilter(data, filter);
        }
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1);
        }
        wekaSvm.buildClassifier(data);
    } catch (final Exception e) {
        logger.error("Unexpected exception", e);
    }

}