Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

In this page you can find the example usage for weka.core Instances attribute.

Prototype

public Attribute attribute(String name)

Source Link

Document

Returns an attribute given its name.

Usage

From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java

License:Apache License

/**
 * Loads the given DECENT file and transforms it from decent -> arffx -> arff.
 * <p>
 * If a file named {@code model.arff} already exists next to the given file, it is loaded
 * directly and the transformation is skipped. Otherwise the EOL scripts are executed, the
 * resulting arffx model is converted into WEKA instances and the result is saved as
 * {@code model.arff} for subsequent calls.
 *
 * @param file
 *            the DECENT model file
 * @return Instances in WEKA format
 * @throws RuntimeException
 *             if an existing ARFF file cannot be found or read
 * @throws IllegalStateException
 *             if the arffx resource does not contain any model
 */
@Override
public Instances load(File file) {

    // Set attributeFilter
    setAttributeFilter();

    // Register MetaModels
    try {
        registerMetaModels();
    } catch (Exception e1) {
        Console.printerrln("Metamodels cannot be registered!");
        e1.printStackTrace();
    }

    // Set location of decent and arffx Model
    String decentModelLocation = file.getAbsolutePath();
    String pathToDecentModelFolder = decentModelLocation.substring(0,
            decentModelLocation.lastIndexOf(File.separator));
    String arffxModelLocation = pathToDecentModelFolder + "/model.arffx";
    String logModelLocation = pathToDecentModelFolder + "/model.log";
    String arffLocation = pathToDecentModelFolder + "/model.arff";

    // If arff File exists, load from it!
    if (new File(arffLocation).exists()) {
        System.out.println("Loading arff File...");
        Instances data;
        // try-with-resources closes the reader even if parsing the ARFF content fails
        try (BufferedReader reader = new BufferedReader(new FileReader(arffLocation))) {
            data = new Instances(reader);
        } catch (FileNotFoundException e) {
            Console.printerrln("File with path: " + arffLocation + " was not found.");
            throw new RuntimeException(e);
        } catch (IOException e) {
            Console.printerrln("File with path: " + arffLocation + " cannot be read.");
            throw new RuntimeException(e);
        }

        // Set class attribute if not set
        if (data.classIndex() == -1) {
            Attribute classAttribute = data.attribute(classAttributeName);
            data.setClass(classAttribute);
        }

        return data;
    }

    // Location of EOL Scripts
    String preprocess = "./decent/epsilon/query/preprocess.eol";
    String arffxToArffSource = "./decent/epsilon/query/addLabels.eol";

    // Set Log Properties
    System.setProperty("epsilon.logLevel", logLevel);
    System.setProperty("epsilon.logToFile", logToFile);
    System.setProperty("epsilon.logFileAvailable", "false");

    // Set decent2arffx Properties
    System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false");
    System.setProperty("epsilon.transformation.decent2arffx.type", "code");

    // Preprocess Data, transform from decent2arffx
    try {
        IEolExecutableModule preProcessModule = loadModule(preprocess);
        IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true);
        IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true);
        preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel);
        preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel);
        execute(preProcessModule, logModelLocation);
        preProcessDecentModel.dispose();
        preProcessArffxarffxModel.dispose();
        preProcessModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for decent or arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Transform to arff, for label and confidence attributes
    try {
        IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource);
        IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true);
        arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel);
        execute(arffxToArffModule, logModelLocation);
        arffxToArffArffxModel.dispose();
        // can be stored and retained alternatively
        arffxToArffModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Unregister MetaModels, otherwise cast will fail.
    // The registry is snapshotted first so that it is not modified while being iterated.
    HashMap<String, Object> metaModelCache = new HashMap<>();
    for (String key : EPackage.Registry.INSTANCE.keySet()) {
        metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key));
    }

    for (String key : metaModelCache.keySet()) {
        EPackage.Registry.INSTANCE.remove(key);
    }

    // Workaround to generate a usable URI. Absolute path is not
    // possible, therefore we need to construct a relative path

    URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation();
    String basePath = location.getFile();

    // Location is the bin folder, so we need to delete the last 4 characters
    basePath = basePath.substring(0, basePath.length() - 4);
    String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath();

    // Load arffx file and create WEKA Instances
    ARFFxResourceTool tool = new ARFFxResourceTool();
    Resource resource = tool.loadResourceFromXMI(relativePath, "arffx");

    Instances dataSet = null;
    for (EObject o : resource.getContents()) {
        Model m = (Model) o;
        dataSet = createWekaDataFormat(m);

        for (Instance i : m.getData()) {
            createWekaInstance(dataSet, i);
        }
    }

    // Without any model in the resource there is nothing to return; fail with a clear
    // message instead of a NullPointerException on the next statement.
    if (dataSet == null) {
        throw new IllegalStateException("No model found in arffx resource: " + arffxModelLocation);
    }

    // Set class attribute
    Attribute classAttribute = dataSet.attribute(classAttributeName);
    dataSet.setClass(classAttribute);

    // Save as ARFF
    save(dataSet, arffLocation);

    return dataSet;

}

From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java

License:Apache License

/**
 * Converts one ARFFx model instance into a WEKA instance and appends it to the data set.
 * <p>
 * Values whose attribute name is contained in the attribute filter are skipped. Label and
 * confidence values are stored as the index of the nominal value, the artifact name is
 * recorded and stored via its artifact index, and every other value is parsed as a double.
 *
 * @param dataSet
 *            WEKA data set to which the converted instance is added
 * @param i
 *            arffx model instance
 */
private void createWekaInstance(Instances dataSet, Instance i) {
    final double[] row = new double[dataSet.numAttributes()];
    int column = 0;

    for (Value value : i.getValues()) {
        final String content = value.getContent();
        final String attributeName = value.getOfAttribute().getName();

        // filtered attributes do not occupy a column in the WEKA data set
        if (!attributeFilter.contains(attributeName)) {
            if (isLabel(attributeName) || isConfidenceLabel(attributeName)) {
                // LABEL.* and CONFIDENCE.* attributes: store the nominal index
                row[column] = dataSet.attribute(column).indexOfValue(content);
            } else if (attributeName.equals("Artifact.Name")) {
                // name of the artifact: remember it and store its index
                artifactNames.add(content);
                row[column] = getIndexOfArtifactName(content);
            } else {
                // everything else is treated as a numeric value
                row[column] = Double.parseDouble(content);
            }
            column++;
        }
    }

    dataSet.add(new DenseInstance(1.0, row));
}

From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java

License:Apache License

/**
 * Loads the file metrics of a project and merges them with the network metrics and the bug
 * information stored next to it.
 * <p>
 * The project name is the part of the file name before the first underscore; the companion
 * files {@code <project>_bugs_per_file.csv} and {@code <project>_network_metrics.csv} are
 * expected in the same directory as the given file.
 *
 * @param fileMetricsFile
 *            CSV file with the file metrics
 * @return the merged data with the nominal attribute "bug" as class attribute, or
 *         {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files; resolve via the absolute file because a bare relative file
    // name has no parent and getParentFile() would return null
    String path = fileMetricsFile.getAbsoluteFile().getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal and string attributes (i.e., NA values) by making them numeric
        for (int j = 2; j < networkData.numAttributes(); j++) {
            Attribute attribute = networkData.attribute(j);
            if (attribute.isNominal() || attribute.isString()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }

        // map the file name (first attribute of the metrics) to its instance index
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                    // the attribute object can match only once, so stop scanning
                    break;
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at the given index by a numeric attribute with the same name.
 * Values that are missing or cannot be parsed as a number become 0.0.
 *
 * @param data
 *            data set that is modified in place
 * @param attIndex
 *            index of the attribute to replace
 */
private static void replaceWithNumericAttribute(Instances data, int attIndex) {
    String attributeName = data.attribute(attIndex).name();
    // entries stay 0.0 unless a parsable value is found
    double[] tmpVals = new double[data.size()];
    for (int i = 0; i < data.size(); i++) {
        Instance inst = data.instance(i);
        if (!inst.isMissing(attIndex)) {
            try {
                tmpVals[i] = Double.parseDouble(inst.stringValue(attIndex));
            } catch (NumberFormatException e) {
                // not a number, using 0.0
                tmpVals[i] = 0.0;
            }
        }
    }
    // replace attribute
    data.deleteAttributeAt(attIndex);
    data.insertAttributeAt(new Attribute(attributeName), attIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attIndex, tmpVals[i]);
    }
}

From source file:de.unidue.langtech.grading.tc.ClusteringTask.java

License:Open Source License

/**
 * Clusters the training data and prints, for every cluster, its members with their gold
 * label followed by a summary table with size, purity and RMSE per cluster.
 *
 * @param aContext
 *            task context used to locate the training data
 * @throws IllegalArgumentException
 *             if the learning mode is multi-label
 * @throws Exception
 *             passed on from the helpers and Weka calls used here
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // outcome labels, indexed by the class value stored in the instances
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // first argument is the clusterer name, the remaining ones are its options
    String[] clustererOptions = clusteringArguments.subList(1, clusteringArguments.size())
            .toArray(new String[0]);
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0), clustererOptions);

    // keep an untouched copy: it still has the instance id that is removed next
    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // mapping from cluster ids to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);
    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Map.Entry<Integer, Set<Integer>> cluster : clusterMap.entrySet()) {
        Integer clusterId = cluster.getKey();
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : cluster.getValue()) {
            Instance instance = copyTrainData.get(offset);

            // the class value is the index of the label in the outcome list
            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);
            clusterAssignments.addSample(clusterId, label);

            // look up the text of the instance via its id feature
            int idIndex = copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index();
            String instanceId = instance.stringValue(idIndex);
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Map.Entry<Integer, Set<Integer>> cluster : clusterMap.entrySet()) {
        Integer clusterId = cluster.getKey();
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);

        // purity: share of the most frequent label within the cluster
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        double rmse = getRMSE(fd, trainOutcomeValues);

        System.out.println(clusterId + "\t" + cluster.getValue().size() + "\t"
                + String.format("%.2f", purity) + "\t" + String.format("%.2f", rmse));
    }
    System.out.println();
}

From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java

License:Apache License

/**
 * Creates an Instance object for the specified List of Features.
 * <br>/*from   w  w w . j a  va  2s . c  o m*/
 * Extracts the Instance objects from a source file and suppresses all features but the ones 
 * specified.
 * 
 * @param fileName File to the training results in ARFF format.
 * @param features List of {@link AbstractFeatureExtractor}s which are currently being tested.
 * @return Instances object consisting of the desired attribute structure.
 * @throws Exception If the ARFF file couldn't be read, an exception is thrown.
 */
public Instances createInstances(String fileName, List<AbstractFeatureExtractor> features) throws Exception {
    final Instances train = new Instances(new BufferedReader(new FileReader(fileName)));
    ArrayList<Attribute> newAttributes = new ArrayList<Attribute>();

    for (int i = 0; i < train.numAttributes(); i++) {
        for (AbstractFeatureExtractor feature : features) {
            if (train.attribute(i).name().equals(feature.getName())) {
                newAttributes.add(train.attribute(i));

                continue;
            }
        }
    }

    /* 
     * add the last two features (ACR-System + correct/false predictions) as those 
     * are no features gathered by a FeatureExtractor.
     */
    newAttributes.add(train.attribute(train.numAttributes() - 2));
    newAttributes.add(train.attribute(train.numAttributes() - 1));
    Instances trainCopy = copyInstances(train, newAttributes);
    trainCopy.setClassIndex(trainCopy.numAttributes() - 1);

    return trainCopy;
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 * <p>
 * The method computes statistics over the full data set, picks distinct
 * random instances as initial centroids and then alternates between
 * assigning instances to clusters and recomputing the centroids until no
 * assignment changes or the maximum number of iterations is reached.
 * 
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    // all further steps operate on this new Instances object, not on the parameter
    Instances instances = new Instances(data);

    // the class index is unset on the working copy
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    // statistics over the full data set
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    // updateClusterInfo is false here, so only the returned values are used
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                                     // value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    // choose the initial centroids
    Random RandomO = new Random(getSeed());
    int instIndex;
    // keys of the instances already chosen as centroid; the values are unused
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    // the selection below swaps instances, so a copy is used when the order
    // of the data has to be preserved
    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
        // an instance whose key was already seen is not added a second time
        if (!initC.containsKey(hk)) {
            m_ClusterCentroids.add(initInstances.instance(instIndex));
            initC.put(hk, null);
        }
        // move the drawn instance to position j, outside the range drawn from next
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
            break;
        }
    }

    // fewer centroids than requested may have been found
    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    // members of each cluster in the current iteration
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        // assignment step: any changed assignment means another iteration is needed
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                // with updateClusterInfo set, moveCentroid also adds the new
                // centroid to m_ClusterCentroids
                moveCentroid(i, tempI[i], true);
            }
        }

        // stop after the configured number of iterations even without convergence
        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                // final iteration: compact tempI and the nominal counts so that
                // only non-empty clusters remain
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];

                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                // not finished: the next iteration refills the smaller array
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            // reset the per-cluster statistics for the next iteration
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // final per-cluster statistics
    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    // no standard deviation for non-numeric attributes
                    vals2[j] = Instance.missingValue();
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    // Save memory!!
    m_DistanceFunction.clean();
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Computes the new coordinates of a centroid from the members of its cluster
 * and the distance function being used.
 * <p>
 * Nominal attributes, and all attributes under Euclidean distance, use the
 * mean or mode; numeric attributes under Manhattan or custom pairwise
 * distance use the median.
 * 
 * @param centroidIndex index of the centroid whose coordinates are computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays and add the centroid to the cluster centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    final int numAttributes = members.numAttributes();
    final int numMembers = members.numInstances();
    double[] vals = new double[numAttributes];

    final boolean euclidean = m_DistanceFunction instanceof EuclideanDistance;
    final boolean medianBased = m_DistanceFunction instanceof ManhattanDistance
            || m_DistanceFunction instanceof CustomPairWiseDistance;

    // only needed for the median computation
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;
    if (medianBased) {
        middle = (numMembers - 1) / 2;
        dataIsEven = (numMembers % 2) == 0;
        sortedMembers = m_PreserveOrder ? members : new Instances(members);
    }

    for (int att = 0; att < numAttributes; att++) {
        final boolean nominal = members.attribute(att).isNominal();

        if (euclidean || nominal) {
            // mean for numeric attributes, mode for nominal ones
            vals[att] = members.meanOrMode(att);
        } else if (medianBased) {
            if (numMembers == 1) {
                // singleton special case
                vals[att] = members.instance(0).value(att);
            } else {
                double median = sortedMembers.kthSmallestValue(att, middle + 1);
                if (dataIsEven) {
                    // even number of members: average the two middle values
                    median = (median + sortedMembers.kthSmallestValue(att, middle + 2)) / 2;
                }
                vals[att] = median;
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][att] = members.attributeStats(att).missingCount;
            m_ClusterNominalCounts[centroidIndex][att] = members.attributeStats(att).nominalCounts;

            final int missing = m_ClusterMissingCounts[centroidIndex][att];
            final boolean markMissing;
            if (nominal) {
                // missing is more frequent than the most frequent value
                final int[] counts = m_ClusterNominalCounts[centroidIndex][att];
                markMissing = missing > counts[Utils.maxIndex(counts)];
            } else {
                // every member is missing this attribute
                markMissing = missing == numMembers;
            }
            if (markMissing) {
                vals[att] = Instance.missingValue();
            }
        }
    }

    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java

License:Open Source License

/**
 * Computes the new coordinates of a centroid by selecting the member with
 * the smallest summed distance to all members of the cluster (the medoid).
 * <p>
 * Only the first attribute value of the selected member is copied into the
 * result; all other entries stay 0 unless they are marked as missing.
 * NOTE(review): this looks tailored to data whose first attribute identifies
 * the instance - confirm against the distance function in use.
 * 
 * @param centroidIndex index of the centroid whose coordinates are computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays; if false, a placeholder with first value 100 is returned
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    if (!updateClusterInfo) {
        vals[0] = 100D;
        return vals;
    }

    // from here on updateClusterInfo is known to be true
    final int numMembers = members.numInstances();
    Instance medoid = null;
    double smallestError = Double.MAX_VALUE;

    for (int candidateIndex = 0; candidateIndex < numMembers; candidateIndex++) {
        Instance candidate = members.instance(candidateIndex);

        // summed distance of the candidate to every member
        double distanceError = 0D;
        for (int other = 0; other < numMembers; other++) {
            distanceError += m_DistanceFunction.distance(candidate, members.instance(other));
        }

        if (distanceError < smallestError) {
            smallestError = distanceError;
            medoid = candidate;
        }
    }

    vals[0] = medoid.valueSparse(0);

    for (int att = 0; att < members.numAttributes(); att++) {
        m_ClusterMissingCounts[centroidIndex][att] = members.attributeStats(att).missingCount;
        m_ClusterNominalCounts[centroidIndex][att] = members.attributeStats(att).nominalCounts;

        final int missing = m_ClusterMissingCounts[centroidIndex][att];
        final boolean markMissing;
        if (members.attribute(att).isNominal()) {
            // missing is more frequent than the most frequent value
            final int[] counts = m_ClusterNominalCounts[centroidIndex][att];
            markMissing = missing > counts[Utils.maxIndex(counts)];
        } else {
            // every member is missing this attribute
            markMissing = missing == members.numInstances();
        }
        if (markMissing) {
            vals[att] = Instance.missingValue();
        }
    }

    m_ClusterCentroids.add(new Instance(1.0, vals));
    return vals;
}

From source file:decisiontree.MyC45.java

/**
* Recursively builds the C45 tree for the given data.
* <p>
* The node splits on the attribute with the maximum gain ratio. It becomes
* a leaf if no instances reach it or if the best gain ratio is zero.
*
* @param instances the training data
* @exception Exception if decision tree can't be built successfully
*/
private void makeTree(Instances instances) throws Exception {

    // No instances have reached this node: leaf with missing class value.
    if (instances.numInstances() == 0) {
        m_Attribute = null;
        m_ClassValue = Instance.missingValue();
        m_Distribution = new double[instances.numClasses()];
        return;
    }

    // Gain ratio of every attribute except the class attribute.
    final int numAttributes = instances.numAttributes();
    final int classIndex = instances.classIndex();
    double[] gainRatios = new double[numAttributes];
    for (int i = 0; i < numAttributes; i++) {
        if (i == classIndex) {
            continue;
        }
        Attribute attr = instances.attribute(i);
        if (attr.isNominal()) {
            gainRatios[attr.index()] = computeGainRatio(instances, attr);
        } else if (attr.isNumeric()) {
            gainRatios[attr.index()] = computeGainRatio(instances, attr, computeThreshold(instances, attr));
        }
    }
    m_Attribute = instances.attribute(Utils.maxIndex(gainRatios));

    // Zero gain ratio: make a leaf holding the class distribution.
    if (Utils.eq(gainRatios[m_Attribute.index()], 0)) {
        m_Attribute = null;
        m_Distribution = new double[instances.numClasses()];
        for (int i = 0; i < instances.numInstances(); i++) {
            Instance inst = instances.instance(i);
            m_Distribution[(int) inst.classValue()]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = instances.classAttribute();
        return;
    }

    // Otherwise split the data and build one successor per subset.
    Instances[] splitData = null;
    int child = 0;
    if (m_Attribute.isNominal()) {
        // one branch per nominal value
        child = m_Attribute.numValues();
        splitData = splitData(instances, m_Attribute);
    } else if (m_Attribute.isNumeric()) {
        // binary split at the threshold
        child = 2;
        splitData = splitData(instances, m_Attribute, computeThreshold(instances, m_Attribute));
    }

    m_Successors = new MyC45[child];
    for (int j = 0; j < child; j++) {
        MyC45 successor = new MyC45();
        m_Successors[j] = successor;
        successor.makeTree(splitData[j]);
    }
}

From source file:decisiontree.MyID3.java

/**
 * Recursively builds the ID3 tree for the given data.
 * <p>
 * The node becomes a leaf if no instances reach it, if only one class value
 * is present, or if the best information gain is zero. Otherwise it splits
 * on the attribute with the maximum information gain and builds one child
 * per attribute value.
 *
 * @param data the training data that reaches this node
 */
private void makeTree(Instances data) {
    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        splitAttr = null;
        leafValue = Double.NaN;
        leafDist = new double[data.numClasses()];
        return;
    }

    // Only one class value left: pure leaf. Fill the same leaf fields as the
    // zero-gain leaf below so that both kinds of leaf carry a distribution
    // and the class attribute.
    if (data.numDistinctValues(data.classIndex()) == 1) {
        splitAttr = null;
        leafValue = data.firstInstance().classValue();
        leafDist = new double[data.numClasses()];
        if (!Double.isNaN(leafValue)) {
            // all probability mass on the single class
            leafDist[(int) leafValue] = 1.0;
        }
        classAttr = data.classAttribute();
        return;
    }

    // Compute attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    splitAttr = data.attribute(maxIndex(infoGains));

    // Make leaf if information gain is zero.
    // Otherwise create successors.
    if (Utils.eq(infoGains[splitAttr.index()], 0)) {
        splitAttr = null;
        leafDist = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            leafDist[(int) inst.classValue()]++;
        }
        normalize(leafDist);
        leafValue = Utils.maxIndex(leafDist);
        classAttr = data.classAttribute();
    } else {
        Instances[] splitData = splitData(data, splitAttr);
        child = new MyID3[splitAttr.numValues()];
        for (int j = 0; j < splitAttr.numValues(); j++) {
            child[j] = new MyID3();
            child[j].makeTree(splitData[j]);
        }
    }
}