List of usage examples for weka.core Instances numInstances
public int numInstances()
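For orientation, a minimal standalone sketch of the call itself (the ARFF file name is a placeholder):

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset from disk; "data.arff" is an illustrative path.
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
        // numInstances() returns the number of rows currently in the dataset.
        System.out.println("Dataset contains " + data.numInstances() + " instances");
    }
}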
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the chi-square statistic for a given subset.
 *
 * @param subset          the data for which the statistic is to be computed
 * @param att             the attribute
 * @param setClassCounts  class counts for the initial set of instances
 * @param setNumInstances number of instances in the initial set
 * @return the chi-square value for the given attribute and data
 */
private double computeChiSquareForSubset(Instances subset, Attribute att, double[] setClassCounts,
        double setNumInstances) {

    double[] subsetClassCounts = GetClassCounts(subset);
    double result = 0;
    // Fraction of the parent set that fell into this subset.
    double d = subset.numInstances() / setNumInstances;
    for (int j = 0; j < subset.numClasses(); j++) {
        // Expected count for class j if the split were independent of class.
        double ciNew = setClassCounts[j] * d;
        if (ciNew > 0) {
            result += Math.pow(subsetClassCounts[j] - ciNew, 2) / ciNew;
        }
    }
    return result;
}
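The loop is the standard Pearson chi-square statistic: the expected count for class j is the parent set's class count scaled by the subset's share of the data, and the statistic sums (observed - expected)^2 / expected over classes. A standalone sketch of that formula (the counts below are made up, not from ID3Chi):

public class ChiSquareSketch {
    // Pearson chi-square over observed vs. expected class counts.
    static double chiSquare(double[] observed, double[] expected) {
        double result = 0;
        for (int j = 0; j < observed.length; j++) {
            if (expected[j] > 0) {
                result += Math.pow(observed[j] - expected[j], 2) / expected[j];
            }
        }
        return result;
    }

    public static void main(String[] args) {
        double[] observed = { 8, 2 };  // class counts seen in the subset
        double[] expected = { 5, 5 };  // counts expected from the parent distribution
        System.out.println(chiSquare(observed, expected)); // 3.6
    }
}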
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes information gain for an attribute.
 *
 * @param data             the data for which info gain is to be computed
 * @param att              the attribute
 * @param entropyOfAllData entropy of the data set
 * @return the information gain for the given attribute and data
 * @throws Exception if computation fails
 */
private double computeInfoGain(Instances data, Attribute att, double entropyOfAllData) throws Exception {

    double infoGain = entropyOfAllData;
    Instances[] subset = splitData(data, att);

    // Slot [att.numValues()] holds the instances whose value for att is missing.
    int numUnknown = subset[att.numValues()].numInstances();
    if (numUnknown == data.numInstances()) {
        return 0;
    }

    double[] classCountsUnknownData = GetClassCounts(subset[att.numValues()]);
    for (int j = 0; j < att.numValues(); j++) {
        if (subset[j].numInstances() > 0) {
            // Distribute the unknown-valued instances across subsets
            // proportionally to each subset's size.
            double ratio = (double) subset[j].numInstances() / (double) data.numInstances();
            infoGain -= (((double) subset[j].numInstances() + (double) numUnknown * ratio)
                    / (double) data.numInstances())
                    * computeEntropyWithUnknowns(subset[j], subset[att.numValues()],
                            classCountsUnknownData, ratio);
        }
    }
    return infoGain;
}
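With no missing values (numUnknown == 0), the loop reduces to the textbook formula gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v). A standalone illustration using the classic play-tennis counts (9 positive, 5 negative, split by "outlook" into subsets of sizes 5, 4, 5):

public class InfoGainSketch {
    static double log2(double x) { return Math.log(x) / Math.log(2); }

    // Entropy of a two-class distribution given the count of each class.
    static double entropy(double a, double b) {
        double n = a + b, h = 0;
        if (a > 0) h -= (a / n) * log2(a / n);
        if (b > 0) h -= (b / n) * log2(b / n);
        return h;
    }

    public static void main(String[] args) {
        double parent = entropy(9, 5);                 // H(S) ~ 0.940
        double gain = parent
                - (5.0 / 14) * entropy(2, 3)           // sunny
                - (4.0 / 14) * entropy(4, 0)           // overcast
                - (5.0 / 14) * entropy(3, 2);          // rain
        System.out.println(gain);                      // ~0.247
    }
}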
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the entropy of a dataset.
 *
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {
    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        if (classCounts[j] > 0) {
            entropy -= classCounts[j] * Utils.log2(classCounts[j]);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
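The two-step computation avoids dividing each count by N inside the loop: since p_j = c_j / N, the definition -sum_j p_j log2(p_j) rearranges to log2(N) - (1/N) sum_j c_j log2(c_j), which is exactly the count-based form returned above. A standalone check with made-up counts:

public class EntropyIdentityCheck {
    static double log2(double x) { return Math.log(x) / Math.log(2); }

    public static void main(String[] args) {
        double[] counts = { 6, 2 };  // illustrative class counts
        double n = 8;

        // Direct definition: -sum p_j * log2(p_j).
        double direct = 0;
        for (double c : counts) {
            double p = c / n;
            direct -= p * log2(p);
        }

        // Count-based form used by computeEntropy above.
        double viaCounts = 0;
        for (double c : counts) {
            viaCounts -= c * log2(c);
        }
        viaCounts = viaCounts / n + log2(n);

        System.out.println(direct + " == " + viaCounts); // both ~0.811
    }
}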
From source file:ID3Chi.java
License:Open Source License
private double computeEntropyWithUnknowns(Instances data, Instances unknownData,
        double[] classCountsUnknownData, double ratio) throws Exception {

    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        // Each class count is augmented by a proportional share of the
        // instances whose attribute value is unknown.
        double p = classCounts[j] + classCountsUnknownData[j] * ratio;
        if (p > 0) {
            entropy -= p * Utils.log2(p);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
From source file:ID3Chi.java
License:Open Source License
/**
 * Splits a dataset according to the values of a nominal attribute.
 *
 * @param data the data which is to be split
 * @param att  the attribute to be used for splitting
 * @return the sets of instances produced by the split
 */
private Instances[] splitData(Instances data, Attribute att) {

    // Slot [att.numValues()] collects instances with "unknown" (missing) values.
    Instances[] subset = new Instances[att.numValues() + 1];
    for (int j = 0; j <= att.numValues(); j++) {
        subset[j] = new Instances(data, data.numInstances());
    }

    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        if (inst.isMissing(att)) {
            subset[att.numValues()].add(inst);
        } else {
            subset[(int) inst.value(att)].add(inst);
        }
    }

    for (int i = 0; i < subset.length; i++) {
        subset[i].compactify();
    }
    return subset;
}
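The same bucket layout can be exercised against a toy dataset built in code. The sketch below (names and data are made up, and it assumes Weka 3.7+ where DenseInstance exists; this listing's sources use an older Weka API) partitions four instances by a two-valued nominal attribute, reserving the last slot for missing values:

import java.util.ArrayList;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class SplitSketch {
    public static void main(String[] args) {
        // Tiny dataset with one nominal attribute ("color").
        ArrayList<Attribute> atts = new ArrayList<>();
        atts.add(new Attribute("color", new ArrayList<>(Arrays.asList("red", "blue"))));
        Instances data = new Instances("demo", atts, 4);
        Attribute color = data.attribute(0);
        for (int i = 0; i < 4; i++) {
            // Nominal values are stored as indices: 0 = "red", 1 = "blue".
            data.add(new DenseInstance(1.0, new double[] { i % 2 }));
        }

        // Partition by attribute value, with one extra bucket for missing
        // values, mirroring the layout used by splitData above.
        Instances[] subset = new Instances[color.numValues() + 1];
        for (int j = 0; j < subset.length; j++) {
            subset[j] = new Instances(data, data.numInstances());
        }
        for (int i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).isMissing(color)) {
                subset[color.numValues()].add(data.instance(i));
            } else {
                subset[(int) data.instance(i).value(color)].add(data.instance(i));
            }
        }
        System.out.println(subset[0].numInstances() + " red, "
                + subset[1].numInstances() + " blue, "
                + subset[2].numInstances() + " missing"); // 2 red, 2 blue, 0 missing
    }
}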
From source file:MachinLearningInterface.java
private void jButton7ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton7ActionPerformed
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));

        // Append a nominal class attribute ("target") as the last attribute.
        Instances newData = new Instances(data);
        Add filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels("rods,punctua,networks");
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);

        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);

        // Note: this check compares newData with itself, so it can never fire.
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        // Load the serialized model bundled on the classpath.
        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();
        Classifier cls = (Classifier) weka.core.SerializationHelper.read(stream);

        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Tally the predicted labels.
        int p = 0, n = 0, r = 0;
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r++;
            }
            if ("punctua".equals(vec1.toString())) {
                p++;
            }
            if ("networks".equals(vec1.toString())) {
                n++;
            }
        }

        // Write the predictions once, after the tally (the original rewrote
        // this file on every loop iteration).
        try {
            PrintWriter out = new PrintWriter(this.name3 + "_morphology.txt");
            out.println(vec);
            out.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage("Your file: " + this.name3 + ".arff"
                + "\nhas been analysed, and it is composed of-> punctua: " + p
                + ", rods: " + r + ", networks: " + n);
    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }
    IJ.showMessage("analysing complete ");
}
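Stripped of the GUI and filtering details, the core of this handler is a standard Weka batch-scoring loop: load an ARFF file, point the class index at the last attribute, deserialize a saved classifier, and classify every instance. A minimal standalone sketch (file and model names are placeholders, not from this project):

import java.io.BufferedReader;
import java.io.FileReader;
import weka.classifiers.Classifier;
import weka.core.Instances;

public class BatchClassifySketch {
    public static void main(String[] args) throws Exception {
        Instances data = new Instances(new BufferedReader(new FileReader("test.arff")));
        data.setClassIndex(data.numAttributes() - 1);

        // Any previously saved Weka model; "model.ser" is illustrative.
        Classifier cls = (Classifier) weka.core.SerializationHelper.read("model.ser");

        // numInstances() bounds the scoring loop over the loaded rows.
        for (int i = 0; i < data.numInstances(); i++) {
            double pred = cls.classifyInstance(data.instance(i));
            System.out.println((i + 1) + ": " + data.classAttribute().value((int) pred));
        }
    }
}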
From source file:MachinLearningInterface.java
private void jButton10ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton10ActionPerformed
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));

        // Append a nominal class attribute whose labels come from this.liststring.
        Instances newData = new Instances(data);
        Add filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels(this.liststring);
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);

        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);

        // Note: this check compares newData with itself, so it can never fire.
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        // First pass: classify with the user-selected model.
        Classifier cls = (Classifier) weka.core.SerializationHelper.read(this.model);
        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Second pass: classify with the bundled Final.model; both passes
        // append their predictions to the same vector.
        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();
        Classifier cls2 = (Classifier) weka.core.SerializationHelper.read(stream);
        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls2.classifyInstance(newData.instance(i));
            double[] dist = cls2.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Tally the predicted labels.
        int p = 0, n = 0, r = 0;
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r++;
            }
            if ("punctua".equals(vec1.toString())) {
                p++;
            }
            if ("networks".equals(vec1.toString())) {
                n++;
            }
        }

        // Write the predictions once, after the tally (the original rewrote
        // this file on every loop iteration).
        try {
            PrintWriter out = new PrintWriter(this.name3 + "_morphology.txt");
            out.println(vec);
            out.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage("Your file: " + this.name3 + ".arff"
                + "\nhas been analysed, and it is composed of-> punctua: " + p
                + ", rods: " + r + ", networks: " + n);
    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }
    IJ.showMessage("analysing complete ");
    // TODO add your handling code here:
}
From source file:MPCKMeans.java
License:Open Source License
/**
 * Clusters unlabeledData and labeledData (with labels removed),
 * using constraints in labeledPairs to initialize.
 *
 * @param labeledPairs        labeled pairs to be used to initialize
 * @param unlabeledData       unlabeled instances
 * @param labeledData         labeled instances
 * @param numClusters         number of clusters
 * @param startingIndexOfTest starting index of test set in unlabeled data
 * @exception Exception if something goes wrong.
 */
public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData, Instances labeledData,
        int numClusters, int startingIndexOfTest) throws Exception {

    m_TotalTrainWithLabels = labeledData;

    if (labeledPairs != null) {
        m_SeedHash = new HashSet((int) (unlabeledData.numInstances() / 0.75 + 10));
        m_ConstraintsHash = new HashMap();
        m_instanceConstraintHash = new HashMap();

        for (int i = 0; i < labeledPairs.size(); i++) {
            InstancePair pair = (InstancePair) labeledPairs.get(i);
            Integer firstInt = new Integer(pair.first);
            Integer secondInt = new Integer(pair.second);

            // Add instances with constraints to the seed hash.
            if (!m_SeedHash.contains(firstInt)) {
                if (m_verbose) {
                    System.out.println("Adding " + firstInt + " to seedHash");
                }
                m_SeedHash.add(firstInt);
            }
            if (!m_SeedHash.contains(secondInt)) {
                m_SeedHash.add(secondInt);
                if (m_verbose) {
                    System.out.println("Adding " + secondInt + " to seedHash");
                }
            }

            if (pair.first >= pair.second) {
                throw new Exception("Ordering reversed - something wrong!!");
            } else {
                // WLOG first < second.
                InstancePair newPair = new InstancePair(pair.first, pair.second,
                        InstancePair.DONT_CARE_LINK);
                m_ConstraintsHash.put(newPair, new Integer(pair.linkType));
                if (m_verbose) {
                    System.out.println("Adding constraint (" + pair.first + ","
                            + pair.second + "), " + pair.linkType);
                }

                // Hash the constraints for the instances involved.
                Object constraintList1 = m_instanceConstraintHash.get(firstInt);
                if (constraintList1 == null) {
                    ArrayList constraintList = new ArrayList();
                    constraintList.add(pair);
                    m_instanceConstraintHash.put(firstInt, constraintList);
                } else {
                    ((ArrayList) constraintList1).add(pair);
                }
                Object constraintList2 = m_instanceConstraintHash.get(secondInt);
                if (constraintList2 == null) {
                    ArrayList constraintList = new ArrayList();
                    constraintList.add(pair);
                    m_instanceConstraintHash.put(secondInt, constraintList);
                } else {
                    ((ArrayList) constraintList2).add(pair);
                }
            }
        }
    }

    m_StartingIndexOfTest = startingIndexOfTest;
    if (m_verbose) {
        System.out.println("Starting index of test: " + m_StartingIndexOfTest);
    }

    // Learn the metric using labeled data, then cluster both the labeled
    // and unlabeled data.
    System.out.println("Initializing metric: " + m_metric);
    m_metric.buildMetric(unlabeledData);
    m_metricBuilt = true;
    m_metricLearner.setMetric(m_metric);
    m_metricLearner.setClusterer(this);

    // Normalize all data for SPKMeans.
    if (m_metric.doesNormalizeData()) {
        for (int i = 0; i < unlabeledData.numInstances(); i++) {
            m_metric.normalizeInstanceWeighted(unlabeledData.instance(i));
        }
    }

    // Either create a new metric per cluster if multiple metrics are used,
    // or point them all to m_metric.
    m_metrics = new LearnableMetric[numClusters];
    m_metricLearners = new MPCKMeansMetricLearner[numClusters];
    for (int i = 0; i < m_metrics.length; i++) {
        if (m_useMultipleMetrics) {
            m_metrics[i] = (LearnableMetric) m_metric.clone();
            m_metricLearners[i] = (MPCKMeansMetricLearner) m_metricLearner.clone();
            m_metricLearners[i].setMetric(m_metrics[i]);
            m_metricLearners[i].setClusterer(this);
        } else {
            m_metrics[i] = m_metric;
            m_metricLearners[i] = m_metricLearner;
        }
    }

    buildClusterer(unlabeledData, numClusters);
}
From source file:MPCKMeans.java
License:Open Source License
/**
 * Generates a clusterer. Instances in data have to be
 * either all sparse or all non-sparse.
 *
 * @param data set of instances serving as training data
 * @exception Exception if the clusterer has not been generated successfully
 */
public void buildClusterer(Instances data) throws Exception {
    System.out.println("ML weight=" + m_MLweight);
    System.out.println("CL weight= " + m_CLweight);
    System.out.println("LOG term weight=" + m_logTermWeight);
    System.out.println("Regularizer weight= " + m_regularizerTermWeight);

    m_RandomNumberGenerator = new Random(m_RandomSeed);
    m_isOfflineMetric = (m_metric instanceof OfflineLearnableMetric);

    // Don't rebuild the metric if it was already trained.
    if (!m_metricBuilt) {
        m_metric.buildMetric(data);
        m_metricBuilt = true;
        m_metricLearner.setMetric(m_metric);
        m_metricLearner.setClusterer(this);

        m_metrics = new LearnableMetric[m_NumClusters];
        m_metricLearners = new MPCKMeansMetricLearner[m_NumClusters];
        for (int i = 0; i < m_metrics.length; i++) {
            if (m_useMultipleMetrics) {
                m_metrics[i] = (LearnableMetric) m_metric.clone();
                m_metricLearners[i] = (MPCKMeansMetricLearner) m_metricLearner.clone();
                m_metricLearners[i].setMetric(m_metrics[i]);
                m_metricLearners[i].setClusterer(this);
            } else {
                m_metrics[i] = m_metric;
                m_metricLearners[i] = m_metricLearner;
            }
        }
    }

    setInstances(data);
    m_ClusterCentroids = new Instances(m_Instances, m_NumClusters);
    m_ClusterAssignments = new int[m_Instances.numInstances()];

    if (m_Instances.checkForNominalAttributes() && m_Instances.checkForStringAttributes()) {
        throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n");
    }

    m_ClusterCentroids = m_Initializer.initialize();

    // If all instances are smoothed by the metric, the centroids need to be
    // smoothed too (note that this is independent of centroid smoothing
    // performed by K-Means).
    if (m_metric instanceof InstanceConverter) {
        System.out.println("Converting centroids...");
        Instances convertedCentroids = new Instances(m_ClusterCentroids, m_NumClusters);
        for (int i = 0; i < m_ClusterCentroids.numInstances(); i++) {
            Instance centroid = m_ClusterCentroids.instance(i);
            convertedCentroids.add(((InstanceConverter) m_metric).convertInstance(centroid));
        }
        m_ClusterCentroids.delete();
        for (int i = 0; i < convertedCentroids.numInstances(); i++) {
            m_ClusterCentroids.add(convertedCentroids.instance(i));
        }
    }

    System.out.println("Done initializing clustering ...");
    getIndexClusters();

    if (m_verbose && m_Seedable) {
        printIndexClusters();
        for (int i = 0; i < m_NumClusters; i++) {
            System.out.println("Centroid " + i + ": " + m_ClusterCentroids.instance(i));
        }
    }

    // Some extra work for smoothing metrics.
    if (m_metric instanceof SmoothingMetric && ((SmoothingMetric) m_metric).getUseSmoothing()) {
        SmoothingMetric smoothingMetric = (SmoothingMetric) m_metric;
        Instances smoothedCentroids = new Instances(m_Instances, m_NumClusters);
        for (int i = 0; i < m_ClusterCentroids.numInstances(); i++) {
            Instance smoothedCentroid = smoothingMetric.smoothInstance(m_ClusterCentroids.instance(i));
            smoothedCentroids.add(smoothedCentroid);
        }
        m_ClusterCentroids = smoothedCentroids;
        updateSmoothingMetrics();
    }

    runKMeans();
}
From source file:MPCKMeans.java
License:Open Source License
/** Sets training instances. */
public void setInstances(Instances instances) {
    m_Instances = instances;

    // Create the checksum coefficients.
    m_checksumCoeffs = new double[instances.numAttributes()];
    for (int i = 0; i < m_checksumCoeffs.length; i++) {
        m_checksumCoeffs[i] = m_RandomNumberGenerator.nextDouble();
    }

    // Hash the instance checksums.
    m_checksumHash = new HashMap(instances.numInstances());
    int classIdx = instances.classIndex();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        double[] values = instance.toDoubleArray();
        double checksum = 0;
        for (int j = 0; j < values.length; j++) {
            if (j != classIdx) {
                checksum += m_checksumCoeffs[j] * values[j];
            }
        }

        // Take care of chaining: instances with equal checksums share a bucket.
        Object list = m_checksumHash.get(new Double((float) checksum));
        ArrayList idxList = null;
        if (list == null) {
            idxList = new ArrayList();
            m_checksumHash.put(new Double((float) checksum), idxList);
        } else {
            idxList = (ArrayList) list;
        }
        idxList.add(new Integer(i));
    }
}
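The checksum here is a random linear projection of each instance's non-class values: identical instances always produce identical checksums, so the hash gives cheap duplicate lookup, with the chained index lists absorbing accidental collisions between distinct instances. A standalone sketch of the same idea on made-up data:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

public class ChecksumHashSketch {
    public static void main(String[] args) {
        double[][] rows = { { 1, 2 }, { 3, 4 }, { 1, 2 } };  // row 2 duplicates row 0
        Random rng = new Random(42);
        double[] coeffs = { rng.nextDouble(), rng.nextDouble() };

        HashMap<Double, ArrayList<Integer>> hash = new HashMap<>();
        for (int i = 0; i < rows.length; i++) {
            // Dot product with fixed random coefficients acts as the checksum.
            double checksum = 0;
            for (int j = 0; j < rows[i].length; j++) {
                checksum += coeffs[j] * rows[i][j];
            }
            // Identical rows always land in the same bucket.
            hash.computeIfAbsent((double) (float) checksum, k -> new ArrayList<>()).add(i);
        }
        System.out.println(hash.values()); // duplicates share a bucket: [0, 2] and [1]
    }
}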