List of usage examples for weka.core.Instances.numInstances()
public int numInstances()
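numInstances() returns the number of instances (rows) held by a weka.core.Instances dataset; the usual idiom is to pair it with instance(int) in an index loop, as the scraped examples below do. First, a minimal self-contained sketch (the ARFF path is a placeholder):

import java.io.BufferedReader;
import java.io.FileReader;

import weka.core.Instances;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset from an ARFF file (placeholder path).
        Instances data = new Instances(new BufferedReader(new FileReader("data/iris.arff")));

        // numInstances() reports how many rows the dataset holds.
        System.out.println("Dataset contains " + data.numInstances() + " instances.");

        // Common idiom: iterate over every instance by index.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(data.instance(i));
        }
    }
}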
From source file:gr.uoc.nlp.opinion.dispatcher.HttpEventHandler.java
private JSONObject classifyArguments(MysqlConnect mysql, POSTagDocuments parse, ParseXML xml) throws IOException {

    //create csv file in temp folder
    xml.createArgumentsCSVFile(parse.getDir());

    //get file names
    String src = parse.getDir() + xml.getCommentName() + "_arguments.csv";
    String dst = parse.getDir() + xml.getCommentName() + "_arguments.arff";

    //create .arff file for weka
    FileConversion.csvToArff(src, dst);

    //initialize argument extraction class
    AnalyzeArguments anar = new AnalyzeArguments(mysql);

    //create set with unclassified sentences
    Instances unclassified = new Instances(new BufferedReader(new FileReader(dst)));

    Instances classified = anar.classify(anar.classifier("smo"), unclassified);

    //create empty JSONObject
    JSONObject json = new JSONObject();

    //append results to JSONObject
    JSONArray jlist = new JSONArray();
    for (int i = 0; i < classified.numInstances(); i++) {
        JSONObject arg = new JSONObject();
        arg.put("s", classified.instance(i).toStringNoWeight());
        jlist.add(arg);
    }

    JSONObject arguments = new JSONObject();
    arguments.put("sentences", jlist);
    //arguments.put("classifier_output", anar.getClassifierOutput());

    json.put("arguments", arguments);

    return json;
}
From source file:gr.uoc.nlp.opinion.dispatcher.HttpEventHandler.java
private JSONObject classifySuggestions(JSONObject json, MysqlConnect mysql, POSTagDocuments parse, ParseXML xml)
        throws IOException {

    //create csv file in temp folder
    xml.createSuggestionsCSVFile(parse.getDir(), ElasticSearch);

    //get file names
    String src = parse.getDir() + xml.getCommentName() + "_suggestions.csv";
    String dst = parse.getDir() + xml.getCommentName() + "_suggestions.arff";

    //create .arff file for weka
    FileConversion.csvToArff(src, dst);

    //initialize suggestion extraction class
    AnalyzeSuggestions ansu = new AnalyzeSuggestions(mysql);

    //create set with unclassified sentences
    Instances unclassified = new Instances(new BufferedReader(new FileReader(dst)));

    Instances classified = ansu.classify(ansu.classifier("randomforest"), unclassified);

    JSONArray jlist = new JSONArray();
    for (int i = 0; i < classified.numInstances(); i++) {
        JSONObject arg = new JSONObject();
        arg.put("s", classified.instance(i).toStringNoWeight());
        jlist.add(arg);
    }

    //create empty JSONObject
    JSONObject suggestions = new JSONObject();

    //append results to JSONObject
    suggestions.put("sentences", jlist);
    //suggestions.put("classifier_output", ansu.getClassifierOutput());

    json.put("suggestions", suggestions);

    return json;
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Select only instances with weights that contribute to
 * the specified quantile of the weight distribution
 *
 * @param data the input instances
 * @param quantile the specified quantile, e.g. 0.9 to select
 * 90% of the weight mass
 * @return the selected instances
 */
protected Instances selectWeightQuantile(Instances data, double quantile) {

    int numInstances = data.numInstances();
    Instances trainData = new Instances(data, numInstances);
    double[] weights = new double[numInstances];
    double sumOfWeights = 0;

    for (int i = 0; i < numInstances; i++) {
        weights[i] = data.instance(i).weight();
        sumOfWeights += weights[i];
    }
    double weightMassToSelect = sumOfWeights * quantile;
    int[] sortedIndices = Utils.sort(weights);

    // Select the instances
    sumOfWeights = 0;
    for (int i = numInstances - 1; i >= 0; i--) {
        Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
        trainData.add(instance);
        sumOfWeights += weights[sortedIndices[i]];
        if ((sumOfWeights > weightMassToSelect)
                && (i > 0)
                && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
            break;
        }
    }
    if (m_Debug) {
        System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
    }
    return trainData;
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Builds a random sample containing nMaj majority-class and nMin
 * minority-class examples drawn from the given dataset.
 *
 * @param copia the dataset to sample from
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param nMaj number of majority-class examples to draw
 * @param nMin number of minority-class examples to draw
 * @param simplingRandom the random number generator
 * @return the sampled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int nMaj, int nMin,
        Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // class name of the majority class
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    Instances myDataset = new Instances(copia, 0);

    // save the indexes of the majority and minority examples
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }
    if (minCount <= 0)
        return copia;

    /* random undersampling of the majority */
    int r;
    if (nMaj == majCount) {
        // keep every majority example
        for (int i = 0; i < nMaj; i++) {
            myDataset.add(copia.instance(majExamples[i]));
        }
    } else {
        // draw nMaj majority examples with replacement
        for (int i = 0; i < nMaj; i++) {
            r = simplingRandom.nextInt(majCount);
            myDataset.add(copia.instance(majExamples[r]));
        }
    }
    // draw nMin minority examples with replacement
    for (int i = 0; i < nMin; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }

    myDataset.randomize(simplingRandom);

    return myDataset;
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Boosting method. Boosts using resampling
 *
 * @param data the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
protected void buildClassifierUsingResampling(Instances data) throws Exception {

    Instances trainData, sample, training;
    double epsilon, reweight, sumProbs;
    Evaluation evaluation;
    int numInstances = data.numInstances();
    Random randomInstance = new Random(m_Seed);
    int resamplingIterations = 0;

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;

    // Create a copy of the data so that when the weights are diddled
    // with it doesn't mess up the weights for anyone else
    training = new Instances(data, 0, numInstances);
    sumProbs = training.sumOfWeights();
    for (int i = 0; i < training.numInstances(); i++) {
        training.instance(i).setWeight(training.instance(i).weight() / sumProbs);
    }

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length;
            m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }

        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            trainData = new Instances(training);
        }

        // Resample
        resamplingIterations = 0;
        double[] weights = new double[trainData.numInstances()];
        for (int i = 0; i < weights.length; i++) {
            weights[i] = trainData.instance(i).weight();
        }
        do {
            sample = trainData.resampleWithWeights(randomInstance, weights);

            int classNum[] = sample.attributeStats(sample.classIndex()).nominalCounts;
            int minC, nMin = classNum[0];
            int majC, nMaj = classNum[1];
            if (nMin < nMaj) {
                minC = 0;
                majC = 1;
            } else {
                minC = 1;
                majC = 0;
                nMin = classNum[1];
                nMaj = classNum[0];
            }

            // balance the data which boosting generates for training the base classifier
            Instances sampleData = randomSampling(sample, majC, minC, nMaj, nMaj, randomInstance);

            // Build and evaluate classifier
            m_Classifiers[m_NumIterationsPerformed].buildClassifier(sampleData);
            evaluation = new Evaluation(data);
            evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
            epsilon = evaluation.errorRate();
            resamplingIterations++;
        } while (Utils.eq(epsilon, 0) && (resamplingIterations < MAX_NUM_RESAMPLING_ITERATIONS));

        // Stop if error too big or 0
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon + "  beta = "
                    + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}
From source file:gyc.OverBoostM1.java
License:Open Source License
/**
 * Boosting method. Boosts any classifier that can handle weighted
 * instances.
 *
 * @param data the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
protected void buildClassifierWithWeights(Instances data) throws Exception {

    Instances trainData, training;
    double epsilon, reweight;
    Evaluation evaluation;
    int numInstances = data.numInstances();
    Random randomInstance = new Random(m_Seed);

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;

    // Create a copy of the data so that when the weights are diddled
    // with it doesn't mess up the weights for anyone else
    training = new Instances(data, 0, numInstances);

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length;
            m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }

        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            trainData = new Instances(training, 0, numInstances);
        }

        // Build the classifier
        if (m_Classifiers[m_NumIterationsPerformed] instanceof Randomizable)
            ((Randomizable) m_Classifiers[m_NumIterationsPerformed]).setSeed(randomInstance.nextInt());
        // this is the training data for building the base classifier
        m_Classifiers[m_NumIterationsPerformed].buildClassifier(trainData);

        // Evaluate the classifier
        evaluation = new Evaluation(data);
        evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
        epsilon = evaluation.errorRate();

        // Stop if error too big or 0 and ignore this model
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon + "  beta = "
                    + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Creates a new dataset of the same size using random sampling
 * with replacement according to the given weight vector. The
 * weights of the instances in the new dataset are set to one.
 * The length of the weight vector has to be the same as the
 * number of instances in the dataset, and all weights have to
 * be positive.
 *
 * @param data the data to be sampled from
 * @param random a random number generator
 * @param sampled indicating which instance has been sampled
 * @return the new dataset
 * @throws IllegalArgumentException if the weights array is of the wrong
 * length or contains negative weights.
 */
public final Instances resampleWithWeights(Instances data, Random random, boolean[] sampled) {

    double[] weights = new double[data.numInstances()];
    for (int i = 0; i < weights.length; i++) {
        weights[i] = data.instance(i).weight();
    }
    Instances newData = new Instances(data, data.numInstances());
    if (data.numInstances() == 0) {
        return newData;
    }
    double[] probabilities = new double[data.numInstances()];
    double sumProbs = 0, sumOfWeights = Utils.sum(weights);
    for (int i = 0; i < data.numInstances(); i++) {
        sumProbs += random.nextDouble();
        probabilities[i] = sumProbs;
    }
    Utils.normalize(probabilities, sumProbs / sumOfWeights);

    // Make sure that rounding errors don't mess things up
    probabilities[data.numInstances() - 1] = sumOfWeights;
    int k = 0;
    int l = 0;
    sumProbs = 0;
    while ((k < data.numInstances() && (l < data.numInstances()))) {
        if (weights[l] < 0) {
            throw new IllegalArgumentException("Weights have to be positive.");
        }
        sumProbs += weights[l];
        while ((k < data.numInstances()) && (probabilities[k] <= sumProbs)) {
            newData.add(data.instance(l));
            sampled[l] = true;
            newData.instance(k).setWeight(1);
            k++;
        }
        l++;
    }
    return newData;
}
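For context, the boolean[] sampled argument lets the caller track which instances were drawn into the bag, so the complement can serve as the out-of-bag set; the (commented-out) standard bagging path in SMOTEBagging.buildClassifier calls it exactly this way. A minimal, hypothetical call site:

// inside a bagging loop (sketch; names are illustrative)
boolean[] inBag = new boolean[data.numInstances()];
Instances bagData = resampleWithWeights(data, random, inBag);
// afterwards, instances with inBag[i] == false are out-of-bag for this iteration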
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Keeps 100% of the majority class, resamples the minority class at
 * rate (Nmaj/Nmin)*a%, and rebalances the result with SMOTE (k, a).
 *
 * @param copia the dataset to sample from
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param a the resampling rate in percent
 * @param simplingRandom the random number generator
 * @return the sampled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // resample the minority class at rate (Nmaj/Nmin)*a%
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100;

    // class name of the majority class
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    // save the indexes of the majority and minority examples
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    Instances myDataset = new Instances(copia, 0);
    int r;

    // keep 100% of the majority class
    for (int i = 0; i < majCount; i++) {
        myDataset.add(copia.instance(majExamples[i]));
    }
    if (minCount == 0)
        return myDataset;

    // draw (Nmaj/Nmin)*a% minority examples with replacement
    for (int i = 0; i < size; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);

    if (size == 1) {
        try {
            // too few minority examples for SMOTE's nearest-neighbor step;
            // oversample towards a uniform class distribution instead
            Resample filter = new Resample();
            filter.setInputFormat(myDataset);
            filter.setBiasToUniformClass(1.0);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (size > 1) {
        try {
            SMOTE filter = new SMOTE();
            filter.setInputFormat(myDataset); // filter capabilities are checked here
            // percentage of synthetic minority instances to create
            double value = 100.0 * majCount / size - 100;
            filter.setPercentage(value);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return myDataset;
}
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Bagging method.
 *
 * @param data the training data to be used for generating the
 * bagged classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if out-of-bag error is to be calculated!");
    }

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];

    int b = 0;
    for (int j = 0; j < m_Classifiers.length; j++) {

        int classNum[] = data.attributeStats(data.classIndex()).nominalCounts;
        int minC, nMin = classNum[0];
        int majC, nMaj = classNum[1];
        if (nMin < nMaj) {
            minC = 0;
            majC = 1;
        } else {
            minC = 1;
            majC = 0;
            nMin = classNum[1];
            nMaj = classNum[0];
        }

        // increase the minority resampling rate by 10 per iteration
        b = b + 10;
        Instances bagData = randomSampling(data, majC, minC, b, random);

        // NOTE: the standard Bagging in-bag bookkeeping is bypassed here,
        // so inBag is never filled and the out-of-bag loop below assumes it

        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);
    }

    // calc OOB error?
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1];
            else
                votes = new double[data.numClasses()];

            // determine predictions for instance
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                if (inBag[j][i])
                    continue;
                voteCount++;
                double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric)
                    votes[0] += pred;
                else
                    votes[(int) pred]++;
            }

            // "vote"
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                vote = Utils.maxIndex(votes); // majority vote
            }

            // error for instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue())
                        * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }
        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}
From source file:gyc.UnderOverBoostM1.java
License:Open Source License
/**
 * Builds a class-balanced random sample: a% of the majority-class size
 * is drawn from each class, with replacement.
 *
 * @param copia the dataset to sample from
 * @param majC index of the majority class value
 * @param minC index of the minority class value
 * @param a the sampling rate in percent
 * @param simplingRandom the random number generator
 * @return the sampled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;

    // the new data-set contains a% of the majority-class size, per class
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2;

    // class name of the majority class
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    // save the indexes of the majority and minority examples
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random under/over-sampling: draw size/2 examples from each class */
    Instances myDataset = new Instances(copia, 0);
    int r;
    for (int i = 0; i < size / 2; i++) {
        r = simplingRandom.nextInt(majCount);
        myDataset.add(copia.instance(majExamples[r]));
        if (minCount > 0) {
            r = simplingRandom.nextInt(minCount);
            myDataset.add(copia.instance(minExamples[r]));
        }
    }
    myDataset.randomize(simplingRandom);

    return myDataset;
}