Example usage for weka.core Instances numInstances

List of usage examples for weka.core Instances numInstances

Introduction

On this page you can find example usage for weka.core.Instances.numInstances().

Prototype


public int numInstances() 

Document

Returns the number of instances in the dataset.
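
For orientation, here is a minimal, self-contained sketch of numInstances() in action. The file path is a placeholder; any ARFF file will do.

import java.io.BufferedReader;
import java.io.FileReader;

import weka.core.Instances;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // load a dataset from an ARFF file (placeholder path)
        Instances data = new Instances(new BufferedReader(new FileReader("data/iris.arff")));

        // numInstances() returns the number of rows currently in the dataset
        System.out.println("Dataset size: " + data.numInstances());

        // the usual iteration idiom, used throughout the examples below
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(data.instance(i));
        }
    }
}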

Usage

From source file:gr.uoc.nlp.opinion.dispatcher.HttpEventHandler.java

private JSONObject classifyArguments(MysqlConnect mysql, POSTagDocuments parse, ParseXML xml)
        throws IOException {

    //create csv file in temp folder
    xml.createArgumentsCSVFile(parse.getDir());

    //get file names
    String src = parse.getDir() + xml.getCommentName() + "_arguments.csv";
    String dst = parse.getDir() + xml.getCommentName() + "_arguments.arff";

    //create .arff file for weka
    FileConversion.csvToArff(src, dst);

    //initialize argument extraction class
    AnalyzeArguments anar = new AnalyzeArguments(mysql);

    //create set with unclassified sentences
    Instances unclassified = new Instances(new BufferedReader(new FileReader(dst)));
    Instances classified = anar.classify(anar.classifier("smo"), unclassified);

    //create empty JSONObject
    JSONObject json = new JSONObject();

    //append results to JSONObject
    JSONArray jlist = new JSONArray();
    for (int i = 0; i < classified.numInstances(); i++) {
        JSONObject arg = new JSONObject();
        arg.put("s", classified.instance(i).toStringNoWeight());
        jlist.add(arg);
    }

    JSONObject arguments = new JSONObject();
    arguments.put("sentences", jlist);
    //        arguments.put("classifier_output", anar.getClassifierOutput());

    json.put("arguments", arguments);

    return json;
}
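
The helper classes above (AnalyzeArguments, MysqlConnect, and so on) are project-specific. A stripped-down sketch of the same pattern, assuming the class attribute is the last one in the ARFF file and using Weka's SMO classifier directly:

import java.io.BufferedReader;
import java.io.FileReader;

import weka.classifiers.Classifier;
import weka.classifiers.functions.SMO;
import weka.core.Instances;

public class ClassifySketch {
    public static void main(String[] args) throws Exception {
        // "train.arff" is a placeholder; Weka needs the class attribute set
        Instances data = new Instances(new BufferedReader(new FileReader("train.arff")));
        data.setClassIndex(data.numAttributes() - 1);

        Classifier smo = new SMO();
        smo.buildClassifier(data);

        // label every instance; numInstances() bounds the loop as above
        for (int i = 0; i < data.numInstances(); i++) {
            data.instance(i).setClassValue(smo.classifyInstance(data.instance(i)));
        }
        System.out.println(data);
    }
}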

From source file:gr.uoc.nlp.opinion.dispatcher.HttpEventHandler.java

private JSONObject classifySuggestions(JSONObject json, MysqlConnect mysql, POSTagDocuments parse, ParseXML xml)
        throws IOException {

    //create csv file in temp folder
    xml.createSuggestionsCSVFile(parse.getDir(), ElasticSearch);

    //get file names
    String src = parse.getDir() + xml.getCommentName() + "_suggestions.csv";
    String dst = parse.getDir() + xml.getCommentName() + "_suggestions.arff";

    //create .arff file for weka
    FileConversion.csvToArff(src, dst);

    //initialize suggestion extraction class
    AnalyzeSuggestions ansu = new AnalyzeSuggestions(mysql);

    //create set with unclassified sentences
    Instances unclassified = new Instances(new BufferedReader(new FileReader(dst)));
    Instances classified = ansu.classify(ansu.classifier("randomforest"), unclassified);

    JSONArray jlist = new JSONArray();
    for (int i = 0; i < classified.numInstances(); i++) {
        JSONObject arg = new JSONObject();
        arg.put("s", classified.instance(i).toStringNoWeight());
        jlist.add(arg);
    }

    //create empty JSONObject
    JSONObject suggestions = new JSONObject();
    //append results to JSONObject
    suggestions.put("sentences", jlist);
    //        suggestions.put("classifier_output", ansu.getClassifierOutput());

    json.put("suggestions", suggestions);

    return json;
}

From source file:gyc.OverBoostM1.java

License:Open Source License

/**
 * Select only instances with weights that contribute to 
 * the specified quantile of the weight distribution
 *
 * @param data the input instances
 * @param quantile the specified quantile, e.g. 0.9 to select
 * 90% of the weight mass
 * @return the selected instances
 */
protected Instances selectWeightQuantile(Instances data, double quantile) {

    int numInstances = data.numInstances();
    Instances trainData = new Instances(data, numInstances);
    double[] weights = new double[numInstances];

    double sumOfWeights = 0;
    for (int i = 0; i < numInstances; i++) {
        weights[i] = data.instance(i).weight();
        sumOfWeights += weights[i];
    }
    double weightMassToSelect = sumOfWeights * quantile;
    int[] sortedIndices = Utils.sort(weights);

    // Select the instances
    sumOfWeights = 0;
    for (int i = numInstances - 1; i >= 0; i--) {
        Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
        trainData.add(instance);
        sumOfWeights += weights[sortedIndices[i]];
        if ((sumOfWeights > weightMassToSelect) && (i > 0)
                && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
            break;
        }
    }
    if (m_Debug) {
        System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
    }
    return trainData;
}
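
Utils.sort returns indices in ascending weight order, so the loop above walks from the back to take the heaviest instances first until the requested mass is covered. A standalone sketch of the same selection over a plain weight array (names are illustrative; no Weka types required):

import java.util.Arrays;
import java.util.Comparator;

public class WeightQuantileSketch {
    // indices of the heaviest weights whose mass just exceeds quantile * total
    static Integer[] select(double[] weights, double quantile) {
        Integer[] idx = new Integer[weights.length];
        for (int i = 0; i < idx.length; i++) {
            idx[i] = i;
        }
        // ascending by weight, like weka.core.Utils.sort
        Arrays.sort(idx, Comparator.comparingDouble(i -> weights[i]));

        double target = Arrays.stream(weights).sum() * quantile;
        double sum = 0;
        int cut = idx.length;
        for (int i = idx.length - 1; i >= 0; i--) {
            sum += weights[idx[i]];
            cut = i;
            if (sum > target && i > 0 && weights[idx[i]] != weights[idx[i - 1]]) {
                break;
            }
        }
        return Arrays.copyOfRange(idx, cut, idx.length);
    }

    public static void main(String[] args) {
        double[] w = {0.05, 0.40, 0.10, 0.30, 0.15};
        // heaviest indices covering roughly 90% of the weight mass
        System.out.println(Arrays.toString(select(w, 0.9)));
    }
}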

From source file:gyc.OverBoostM1.java

License:Open Source License

/**
 * Builds a random sample containing nMaj majority-class and nMin
 * minority-class examples drawn from the given dataset, then shuffles it.
 *
 * @param copia the dataset to sample from
 * @param majC the index of the majority class value
 * @param minC the index of the minority class value
 * @param nMaj the number of majority-class examples to draw
 * @param nMin the number of minority-class examples to draw
 * @param simplingRandom the random number generator
 * @return the sampled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int nMaj, int nMin,
        Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, record the indexes of the majority- and minority-class examples
    // the new dataset will contain nMaj + nMin examples
    int size = nMaj + nMin;
    //selected = new int[size]; // we store the selected examples indexes

    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    Instances myDataset = new Instances(copia, 0);
    int nData = 0;
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }
    if (minCount <= 0)
        return copia;
    /* random undersampling of the majority */
    //boolean[] taken = new boolean[copia.numInstances()];
    int r;
    if (nMaj == majCount) {
        //System.out.println("#equal");
        for (int i = 0; i < nMaj; i++) {
            myDataset.add(copia.instance(majExamples[i]));
        }
    } else {
        for (int i = 0; i < nMaj; i++) {
            r = simplingRandom.nextInt(majCount);
            //selected[nData] = majExamples[r];
            myDataset.add(copia.instance(majExamples[r]));
            //taken[majExamples[r]] = true;
        }
    }
    for (int i = 0; i < nMin; i++) {
        r = simplingRandom.nextInt(minCount);
        //System.out.print("_"+r);

        //selected[nData] = minExamples[r];
        myDataset.add(copia.instance(minExamples[r]));
        //taken[minExamples[r]] = true;
    }

    //System.out.println();
    //System.out.println("minC="+minCount+"; majC="+majCount);

    myDataset.randomize(simplingRandom);
    return myDataset;
}
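
A hypothetical call, assuming data is a loaded two-class dataset with its class index set (imports and types as in the surrounding file); the counts mirror what buildClassifierUsingResampling below computes:

int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
int majC = counts[0] >= counts[1] ? 0 : 1;
int minC = 1 - majC;
int nMaj = counts[majC];

// draw nMaj examples of each class to get a balanced bag
Instances balanced = randomSampling(data, majC, minC, nMaj, nMaj, new Random(1));
System.out.println(balanced.numInstances()); // 2 * nMaj when both classes are present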

From source file:gyc.OverBoostM1.java

License:Open Source License

/**
 * Boosting method. Boosts using resampling
 *
 * @param data the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
protected void buildClassifierUsingResampling(Instances data) throws Exception {

    Instances trainData, sample, training;
    double epsilon, reweight, sumProbs;
    Evaluation evaluation;
    int numInstances = data.numInstances();
    Random randomInstance = new Random(m_Seed);
    int resamplingIterations = 0;

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;
    // Create a copy of the data so that when the weights are diddled
    // with it doesn't mess up the weights for anyone else
    training = new Instances(data, 0, numInstances);
    sumProbs = training.sumOfWeights();
    for (int i = 0; i < training.numInstances(); i++) {
        training.instance(i).setWeight(training.instance(i).weight() / sumProbs);
    }

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }

        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            trainData = new Instances(training);
        }

        // Resample
        resamplingIterations = 0;
        double[] weights = new double[trainData.numInstances()];
        for (int i = 0; i < weights.length; i++) {
            weights[i] = trainData.instance(i).weight();
        }
        do {
            sample = trainData.resampleWithWeights(randomInstance, weights);

            // count class frequencies to find the minority and majority classes
            int classNum[] = sample.attributeStats(sample.classIndex()).nominalCounts;
            int minC, nMin = classNum[0];
            int majC, nMaj = classNum[1];
            if (nMin < nMaj) {
                minC = 0;
                majC = 1;
            } else {
                minC = 1;
                majC = 0;
                nMin = classNum[1];
                nMaj = classNum[0];
            }
            //System.out.println("minC="+nMin+"; majC="+nMaj);
            /*
             * balance the data that boosting generates for training the base classifier
            */
            //System.out.println("before:"+classNum[0]+"-"+classNum[1]);
            Instances sampleData = randomSampling(sample, majC, minC, nMaj, nMaj, randomInstance);
            //classNum =sampleData.attributeStats(sampleData.classIndex()).nominalCounts;
            //System.out.println("after:"+classNum[0]+"-"+classNum[1]);

            // Build and evaluate classifier
            m_Classifiers[m_NumIterationsPerformed].buildClassifier(sampleData);

            evaluation = new Evaluation(data);
            evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
            epsilon = evaluation.errorRate();
            resamplingIterations++;
        } while (Utils.eq(epsilon, 0) && (resamplingIterations < MAX_NUM_RESAMPLING_ITERATIONS));

        // Stop if error too big or 0
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon + "  beta = " + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}
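
setWeights is not shown in this listing. A minimal sketch of the AdaBoost.M1-style update it presumably applies, using the reweight factor computed above (the method name and the misclassification flags are illustrative):

public class ReweightSketch {
    // boost the weight of misclassified instances by the given factor,
    // then renormalize so the total weight mass is unchanged
    static void reweight(double[] weights, boolean[] misclassified, double factor) {
        double oldSum = 0, newSum = 0;
        for (double w : weights) {
            oldSum += w;
        }
        for (int i = 0; i < weights.length; i++) {
            if (misclassified[i]) {
                weights[i] *= factor;
            }
            newSum += weights[i];
        }
        for (int i = 0; i < weights.length; i++) {
            weights[i] *= oldSum / newSum;
        }
    }

    public static void main(String[] args) {
        double[] w = {0.25, 0.25, 0.25, 0.25};
        // epsilon = 0.2 gives reweight = (1 - 0.2) / 0.2 = 4
        reweight(w, new boolean[] {true, false, false, false}, (1 - 0.2) / 0.2);
        System.out.println(java.util.Arrays.toString(w));
    }
}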

From source file:gyc.OverBoostM1.java

License:Open Source License

/**
 * Boosting method. Boosts any classifier that can handle weighted
 * instances.
 *
 * @param data the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
protected void buildClassifierWithWeights(Instances data) throws Exception {

    Instances trainData, training;
    double epsilon, reweight;
    Evaluation evaluation;
    int numInstances = data.numInstances();
    Random randomInstance = new Random(m_Seed);

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;

    // Create a copy of the data so that when the weights are diddled
    // with it doesn't mess up the weights for anyone else
    training = new Instances(data, 0, numInstances);

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }
        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            trainData = new Instances(training, 0, numInstances);
        }

        // Build the classifier
        if (m_Classifiers[m_NumIterationsPerformed] instanceof Randomizable)
            ((Randomizable) m_Classifiers[m_NumIterationsPerformed]).setSeed(randomInstance.nextInt());

        // this is the training data for building the base classifier
        m_Classifiers[m_NumIterationsPerformed].buildClassifier(trainData);

        // Evaluate the classifier
        evaluation = new Evaluation(data);
        evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
        epsilon = evaluation.errorRate();

        // Stop if the error is too big or zero, and ignore this model
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }
        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon + "  beta = " + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}

From source file:gyc.SMOTEBagging.java

License:Open Source License

/**
 * Creates a new dataset of the same size using random sampling
 * with replacement according to the given weight vector. The
 * weights of the instances in the new dataset are set to one.
 * The length of the weight vector has to be the same as the
 * number of instances in the dataset, and all weights have to
 * be positive.
 *
 * @param data the data to be sampled from
 * @param random a random number generator
 * @param sampled indicating which instance has been sampled
 * @return the new dataset
 * @throws IllegalArgumentException if the weights array is of the wrong
 * length or contains negative weights.
 */
public final Instances resampleWithWeights(Instances data, Random random, boolean[] sampled) {

    double[] weights = new double[data.numInstances()];
    for (int i = 0; i < weights.length; i++) {
        weights[i] = data.instance(i).weight();
    }
    Instances newData = new Instances(data, data.numInstances());
    if (data.numInstances() == 0) {
        return newData;
    }
    double[] probabilities = new double[data.numInstances()];
    double sumProbs = 0, sumOfWeights = Utils.sum(weights);
    for (int i = 0; i < data.numInstances(); i++) {
        sumProbs += random.nextDouble();
        probabilities[i] = sumProbs;
    }
    Utils.normalize(probabilities, sumProbs / sumOfWeights);

    // Make sure that rounding errors don't mess things up
    probabilities[data.numInstances() - 1] = sumOfWeights;
    int k = 0;
    int l = 0;
    sumProbs = 0;
    while ((k < data.numInstances() && (l < data.numInstances()))) {
        if (weights[l] < 0) {
            throw new IllegalArgumentException("Weights have to be positive.");
        }
        sumProbs += weights[l];
        while ((k < data.numInstances()) && (probabilities[k] <= sumProbs)) {
            newData.add(data.instance(l));
            sampled[l] = true;
            newData.instance(k).setWeight(1);
            k++;
        }
        l++;
    }
    return newData;
}
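
The cumulative uniform draws, normalized to the total weight mass, give sampling with replacement proportional to instance weight. A hypothetical call, assuming data is an already-loaded Instances object (the sampled array must have one slot per instance):

boolean[] sampled = new boolean[data.numInstances()];
Instances bag = resampleWithWeights(data, new Random(42), sampled);

// instances that were never drawn form the out-of-bag set
int outOfBag = 0;
for (boolean s : sampled) {
    if (!s) {
        outOfBag++;
    }
}
System.out.println(bag.numInstances() + " in bag, " + outOfBag + " out of bag");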

From source file:gyc.SMOTEBagging.java

License:Open Source License

/**
 * Keeps 100% of the majority class, resamples the minority class at
 * rate (Nmaj * a / 100), and then applies SMOTE to balance the result.
 *
 * @param copia the dataset to sample from
 * @param majC the index of the majority class value
 * @param minC the index of the minority class value
 * @param a the resampling rate as a percentage
 * @param simplingRandom the random number generator
 * @return the sampled, SMOTEd dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, record the indexes of the majority- and minority-class examples
    // resample min at rate (Nmaj/Nmin)*a%
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random undersampling of the majority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    // keep 100% of the majority-class examples
    for (int i = 0; i < majCount; i++) {
        myDataset.add(copia.instance(majExamples[i]));
    }
    if (minCount == 0)
        return myDataset;
    // resample the minority class at rate (Nmaj/Nmin)*a%, i.e. size draws
    for (int i = 0; i < size; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);

    if (size == 1) {
        try {
            // a single minority draw gives SMOTE no neighbours; fall back to Resample
            Resample filter = new Resample();
            filter.setInputFormat(myDataset);
            filter.setBiasToUniformClass(1.0);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    if (size > 1) {
        try {
            SMOTE filter = new SMOTE();
            filter.setInputFormat(myDataset); // filter capabilities are checked here
            // SMOTE percentage: enough synthetic minority examples to
            // bring the minority class up to majCount
            double value = 100.0 * majCount / size - 100;
            filter.setPercentage(value);
            //if (nMin<5) filter.setNearestNeighbors(nMin);
            filter.setRandomSeed(simplingRandom.nextInt());
            // apply the SMOTE filter to generate synthetic minority examples
            myDataset = Filter.useFilter(myDataset, filter);
            //t.stop();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    return myDataset;
}
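
The percentage arithmetic above is worth a worked example: with 100 majority examples kept and 40 minority draws, SMOTE must generate 60 synthetic minority examples to even things out (figures are illustrative).

public class SmotePercentageSketch {
    public static void main(String[] args) {
        int majCount = 100; // majority examples kept in the bag
        int size = 40;      // minority examples drawn into the bag
        double value = 100.0 * majCount / size - 100; // = 150.0
        // SMOTE creates size * value / 100 = 60 synthetic minority
        // examples, bringing the minority class up to majCount
        System.out.println("SMOTE percentage: " + value);
    }
}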

From source file:gyc.SMOTEBagging.java

License:Open Source License

/**
 * Bagging method.
 *
 * @param data the training data to be used for generating the
 * bagged classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];
    int b = 0;
    for (int j = 0; j < m_Classifiers.length; j++) {

        // count class frequencies to find the minority and majority classes
        int classNum[] = data.attributeStats(data.classIndex()).nominalCounts;
        int minC, nMin = classNum[0];
        int majC, nMaj = classNum[1];
        if (nMin < nMaj) {
            minC = 0;
            majC = 1;
        } else {
            minC = 1;
            majC = 0;
            nMin = classNum[1];
            nMaj = classNum[0];
        }

        b = b + 10;
        Instances bagData = randomSampling(data, majC, minC, b, random);

        /*      // create the in-bag dataset
              if (m_CalcOutOfBag) {
           inBag[j] = new boolean[data.numInstances()];
           bagData = resampleWithWeights(data, random, inBag[j]);
              } else {
           bagData = data.resampleWithWeights(random);
           if (bagSize < data.numInstances()) {
             bagData.randomize(random);
             Instances newBagData = new Instances(bagData, 0, bagSize);
             bagData = newBagData;
           }
              }
                      
              if (m_Classifier instanceof Randomizable) {
           ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
              }*/

        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);
        //classNum=bagData.attributeStats(bagData.classIndex()).nominalCounts;
        //System.out.println("after:"+classNum[0]+"-"+classNum[1]);
    }

    // calc OOB error?
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1];
            else
                votes = new double[data.numClasses()];

            // determine predictions for instance
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                // inBag[j] is only filled by the commented-out in-bag
                // sampling above, so guard against null here
                if (inBag[j] == null || inBag[j][i])
                    continue;

                voteCount++;
                double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric)
                    votes[0] += pred;
                else
                    votes[(int) pred]++;
            }

            // "vote"
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                vote = Utils.maxIndex(votes); // majority vote
            }

            // error for instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }

        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}

From source file:gyc.UnderOverBoostM1.java

License:Open Source License

/**
 * Draws a balanced random sample containing (Nmaj * a / 100) examples
 * of each class, majority and minority alike, then shuffles it.
 *
 * @param copia the dataset to sample from
 * @param majC the index of the majority class value
 * @param minC the index of the minority class value
 * @param a the sampling rate as a percentage of the majority-class count
 * @param simplingRandom the random number generator
 * @return the sampled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, record the indexes of the majority- and minority-class examples
    // the new dataset will contain (Nmaj * a / 100) examples of each class
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random undersampling of the majority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    for (int i = 0; i < size / 2; i++) {
        r = simplingRandom.nextInt(majCount);
        myDataset.add(copia.instance(majExamples[r]));

        if (minCount > 0) {
            r = simplingRandom.nextInt(minCount);
            myDataset.add(copia.instance(minExamples[r]));
        }
    }

    myDataset.randomize(simplingRandom);
    return myDataset;
}