Example usage for weka.core Instances numInstances

List of usage examples for weka.core Instances numInstances

Introduction

In this page you can find the example usage for weka.core Instances numInstances.

Prototype


public int numInstances()

Source Link

Document

Returns the number of instances in the dataset.

Usage

From source file:Clustering.WekaKMeansClustererWrapper.java

/**
 * Clusters the given keyed data with Weka's SimpleKMeans and groups the keys
 * by their assigned cluster.
 *
 * <p>The data is first exported to a temporary ARFF file, re-read as Weka
 * {@code Instances}, stripped of string attributes (the 0th string attribute
 * is kept aside as the per-row key), then clustered.
 *
 * @param data      map of key string to value list; exported via {@code m_ArffExporter}
 * @param clearData if true, {@code data} is cleared once the ARFF file has been written
 * @return one list of keys per requested cluster (trailing lists stay empty when fewer
 *         clusters than {@code m_NumberOfClusters} are produced), or {@code null} on
 *         failure (the error message is kept in {@code m_LastErrorMessage})
 */
public ArrayList<String>[] classify(HashMap<String, List> data, boolean clearData) {
    ArrayList<String>[] clusterResult;
    try {
        File arff = m_ArffExporter.getArff(data);
        int nSize = data.size();
        if (arff == null)
            return null;
        if (clearData)
            data.clear();

        // try-with-resources ensures the stream is closed even if read() throws
        // (the original leaked the FileInputStream on exception)
        Instances instances;
        try (FileInputStream is = new FileInputStream(arff.getAbsolutePath())) {
            instances = ConverterUtils.DataSource.read(is);
        }

        String[] keys = new String[instances.numInstances()];
        for (int i = 0; i < instances.numInstances(); ++i) {
            Instance instance = instances.instance(i);
            keys[i] = instance.stringValue(0); // assume that the 0th attribute is the key string
        }

        // SimpleKMeans cannot handle string attributes
        instances.deleteStringAttributes();

        SimpleKMeans cl = new SimpleKMeans();

        // never request more clusters than there are data points
        int numClusters = m_NumberOfClusters < nSize ? m_NumberOfClusters : nSize;

        String[] options = { "-O", "-N", Integer.toString(numClusters), "-A", m_DistanceFunction };
        cl.setOptions(options);

        cl.buildClusterer(instances);

        clusterResult = new ArrayList[m_NumberOfClusters];
        for (int i = 0; i < m_NumberOfClusters; ++i) {
            clusterResult[i] = new ArrayList<>();
        }

        // map each row's key to the cluster it was assigned to
        int[] assignment = cl.getAssignments();
        for (int i = 0; i < assignment.length; ++i) {
            clusterResult[assignment[i]].add(keys[i]);
        }

        // best-effort cleanup of the temporary ARFF file
        if (!arff.delete())
            arff.deleteOnExit();
    } catch (Exception ex) {
        m_LastErrorMessage = ex.getMessage();
        return null;
    }

    return clusterResult;
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer
 * that are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been
 * generated successfully
 */
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    // Work on a copy so the caller's dataset is not altered by filtering.
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    // Full-dataset per-attribute statistics (missing counts, std devs, modes).
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    // moveCentroid with updateClusterInfo=false just computes the mean/median/mode vector.
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            // if missing occurs more often than the most frequent nominal value,
            // flag the mode as missing
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap(); // hash keys of already-chosen centers, to avoid duplicates
    DecisionTableHashKey hk = null;

    // Random initialization swaps instances around; when preserving order,
    // do that on a copy instead of the working dataset.
    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        // Random init: Fisher-Yates-style draw of distinct instances as centers.
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(),
                    true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }

    // Fewer distinct instances than requested clusters shrinks m_NumClusters.
    m_NumClusters = m_ClusterCentroids.numInstances();

    //removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters]; // per-cluster member lists
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    // Main k-means loop: reassign instances, recompute centroids, repeat
    // until no assignment changes or the iteration cap is reached.
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, false, true);
            if (newC != clusterAssignments[i]) {
                converged = false; // at least one instance moved cluster
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }

        // Drop empty clusters from the bookkeeping arrays.
        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                // compact tempI so the final statistics cover only non-empty clusters
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // calculate errors (skipped when fast distance calculation is requested)
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    // Final per-cluster statistics: sizes and (optionally) std deviations.
    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue(); // std dev undefined for nominal attributes
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Selects the initial cluster centers using the k-means++ scheme:
 * the first center is drawn uniformly at random, and each subsequent
 * center is drawn with probability proportional to its (normalized)
 * distance to the nearest already-chosen center.
 *
 * <p>Side effect: adds the chosen centers to {@code m_ClusterCentroids}.
 *
 * @param data the instances to draw the initial centers from
 * @throws Exception if distance computation or hash-key creation fails
 */
protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    // tracks already-chosen centers so exact duplicates are not added twice
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chosen center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i),
                    m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {

            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            // build the cumulative distribution over instances
            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }

            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues

            // choose a random instance by inverse-CDF sampling
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate of
                        // an already chosen cluster center should have zero distance (and hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center.
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Move the centroid to its new coordinates. Generate the centroid coordinates based
 * on its members (objects assigned to the cluster of the centroid) and the distance
 * function being used.
 * @param centroidIndex index of the centroid which the coordinates will be computed
 * @param members the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            // work on a copy: the median computation below reorders the instances
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        //in case of Euclidian distance the centroid is the mean point
        //in case of Manhattan distance the centroid is the median point
        //in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            //singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                // NOTE(review): relies on kthSmallestValue's side effect of partially
                // reordering sortedMembers so the median lands at index 'middle'
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    // even-sized cluster: median is the mean of the two middle values
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                // missing occurs more often than the most frequent nominal value
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo)
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    return vals;
}

From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java

License:Open Source License

/**
 * Returns the first instance whose performance value (last attribute)
 * equals the dataset's maximum, or null for an empty dataset.
 */
public static Instance findBestPerf(Instances data) {
    // the performance attribute is the last one
    int perfIdx = data.numAttributes() - 1;
    double maxPerf = data.attributeStats(perfIdx).numericStats.max;
    int rows = data.numInstances();
    for (int row = 0; row < rows; row++) {
        Instance candidate = data.get(row);
        if (candidate.value(perfIdx) == maxPerf) {
            return candidate;
        }
    }
    return null; // only reachable when data is empty
}

From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java

License:Open Source License

/**
 * Returns the index of the first instance whose performance value
 * (last attribute) equals the dataset's maximum, or -1 for an empty dataset.
 */
public static int findBestPerfIndex(Instances data) {
    // the performance attribute is the last one
    int perfIdx = data.numAttributes() - 1;
    double maxPerf = data.attributeStats(perfIdx).numericStats.max;
    int rows = data.numInstances();
    for (int row = 0; row < rows; row++) {
        if (data.get(row).value(perfIdx) == maxPerf) {
            return row;
        }
    }
    return -1; // only reachable when data is empty
}

From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java

License:Open Source License

/**
 * Smoke test for the COMT2 model: trains on an ARFF training set, prints an
 * evaluation summary, saves the instance the model considers best, and writes
 * out the training set with class values replaced by the model's predictions.
 */
public static void testCOMT2() throws Exception {
    BestConf bestconf = new BestConf();
    Instances training = DataIOFile.loadDataFromArffFile("data/trainingBestConf0.arff");
    training.setClassIndex(training.numAttributes() - 1);

    // Sample candidate configuration points and give them the same class attribute.
    Instances candidates = LHSInitializer.getMultiDimContinuous(bestconf.getAttributes(),
            InitialSampleSetSize, false);
    candidates.insertAttributeAt(training.classAttribute(), candidates.numAttributes());
    candidates.setClassIndex(candidates.numAttributes() - 1);

    COMT2 model = new COMT2(candidates, COMT2Iteration);
    model.buildClassifier(training);

    // Evaluate on the training data and report the summary.
    Evaluation evaluation = new Evaluation(training);
    evaluation.evaluateModel(model, training);
    System.err.println(evaluation.toSummaryString());

    // Persist the instance the model believes maximizes the class value.
    Instance best = model.getInstanceWithPossibleMaxY(candidates.firstInstance());
    Instances bestSet = new Instances(training, 2);
    bestSet.add(best);
    DataIOFile.saveDataToXrffFile("data/trainingBestConf_COMT2.arff", bestSet);

    //now we output the training set with the class value updated as the predicted value
    Instances predicted = new Instances(training, training.numInstances());
    for (int i = 0; i < training.numInstances(); i++) {
        Instance row = training.instance(i);
        double[] vals = row.toDoubleArray();
        vals[vals.length - 1] = model.classifyInstance(row);
        predicted.add(row.copy(vals));
    }
    DataIOFile.saveDataToXrffFile("data/trainingBestConf0_predict.xrff", predicted);
}

From source file:cn.ict.zyq.bestConf.cluster.Main.AutoTestAdjust.java

License:Open Source License

/**
 * Runs the performance experiment for every sample point that does not yet
 * have a measured performance value, writing each result back into the
 * dataset (and to file via {@code writePerfstoFile}).
 *
 * <p>A failed test run records -1 as the performance; after
 * {@code targetTestErrorNum} consecutive failures the JVM is terminated.
 *
 * @param samplePoints the configurations to test; modified in place
 * @param perfAttName  name of the performance attribute (appended as the
 *                     last attribute if not already present)
 * @return {@code samplePoints} with the performance attribute set as class
 */
public Instances runExp(Instances samplePoints, String perfAttName) {
    // Ensure the performance attribute exists (as the last attribute).
    if (samplePoints.attribute(perfAttName) == null) {
        Attribute performance = new Attribute(perfAttName);
        samplePoints.insertAttributeAt(performance, samplePoints.numAttributes());
    }
    int pos = samplePoints.numInstances();
    int count = 0; // consecutive failed test runs
    for (int i = 0; i < pos; i++) {
        Instance ins = samplePoints.get(i);

        // Skip points whose performance has already been measured.
        if (!Double.isNaN(ins.value(ins.attribute(ins.numAttributes() - 1)))) {
            continue;
        }

        // Map attribute name -> value for the test harness.
        // (raw HashMap retained: startTest's declared signature is not visible here)
        HashMap hm = new HashMap();
        for (int j = 0; j < ins.numAttributes(); j++) {
            hm.put(ins.attribute(j).name(), ins.value(ins.attribute(j)));
        }

        double y;
        boolean testRet = this.startTest(hm, i, isInterrupt);
        if (!testRet) { // the setting does not work, we skip it
            y = -1;
            count++;
            if (count >= targetTestErrorNum) {
                System.out.println(
                        "There must be something wrong with the system. Please check and restart.....");
                System.exit(1);
            }
        } else {
            y = getPerformanceByType(performanceType);
            count = 0; // reset the failure streak on success
        }

        ins.setValue(samplePoints.numAttributes() - 1, y);
        writePerfstoFile(ins);
    }
    Instances retVal = samplePoints;
    retVal.setClassIndex(retVal.numAttributes() - 1);

    return retVal;
}

From source file:com.actelion.research.orbit.imageAnalysis.imaging.TMAPoints.java

License:Open Source License

/**
 * Picks the cluster count in [start, end] that minimizes the Bayesian
 * Information Criterion (lower BIC is better) for the given EM clusterer.
 */
private int guessNumClusters(EM clusterer, Instances instances, int start, int end) throws Exception {
    ClusterEvaluation evaluation = new ClusterEvaluation();
    int chosen = start;
    double lowestBic = Double.POSITIVE_INFINITY;
    for (int c = start; c <= end; c++) {
        clusterer.setNumClusters(c);
        clusterer.buildClusterer(instances);
        evaluation.setClusterer(clusterer);
        evaluation.evaluateClusterer(instances);
        double score = bic(evaluation.getLogLikelihood(), c, instances.numInstances());
        logger.trace("numCluster " + c + " -> BIC: " + score);
        if (score < lowestBic) {
            lowestBic = score;
            chosen = c;
            logger.trace("bestNum: " + chosen);
        }
    }
    return chosen;
}

From source file:com.github.r351574nc3.amex.assignment2.App.java

License:Open Source License

/**
 * Generates a predictive model based on a previously trained and evaluated model.
 *
 * @param inputName unlabeled model to load
 * @param outputName path to the file where results will be stored.
 * @throws Exception if loading, classification, or writing fails
 * @throws FileNotFoundException if the output directory does not exist
 */
public void predict(final String inputName, final String outputName) throws Exception {
    final Instances input = load(inputName);
    final Instances labeled = new Instances(input);

    // Label every instance of the copy with the classifier's prediction.
    // (classifyInstance returns double; using the primitive avoids the
    // original's accidental Double autoboxing/unboxing round trip)
    for (int i = 0; i < input.numInstances(); i++) {
        final double clsLabel = getClassifier().classifyInstance(input.instance(i));
        labeled.instance(i).setClassValue(clsLabel);
    }

    // A name without a path separator is treated as relative to the working directory.
    boolean isLocal = true;
    if (outputName.contains(File.separator)) {
        isLocal = false;
    }

    final File pwd = isLocal ? new File(System.getProperty(USER_DIR_KEY))
            : new File(outputName).getParentFile();
    if (pwd.exists() && pwd.isDirectory()) {
        DataSink.write(outputName, labeled);
    } else {
        throw new FileNotFoundException("Cannot write to " + outputName);
    }
}