List of usage examples for weka.core Utils mean
public static double mean(double[] vector)
From source file:classif.ExperimentsLauncher.java
License:Open Source License
public void launchFSKMeans() { try {/*from w w w .j a v a 2s.c o m*/ // File f = new File(rep + "/" + dataName + "_results.csv"); // // if somebody is processing it // if (f.exists()) { // return; // } // // out = new PrintStream(new FileOutputStream(rep + "/FastKMeansDTW_" + dataName + "_results.csv", true)); // out.println("dataset,algorithm,nbPrototypes,testErrorRate,trainErrorRate"); String algo = "FastKMEANS"; System.out.println(algo); // PrintStream outProto = new PrintStream(new FileOutputStream(rep + "/" + dataName + "_KMEANS.proto", append)); nbPrototypesMax = this.train.numInstances() / this.train.numClasses(); // if (nbPrototypesMax>10) nbPrototypesMax = 50; int tmp; tmp = nbExp; double[] avgerror = new double[5]; double[] avgf1 = new double[5]; // double[] trainrctmp = new double[5]; // double[] testrctmp = new double[5]; // double[] cvrctmp = new double[5]; // boolean stopflag=false; for (int j = 1; j <= nbPrototypesMax; j++) { // double[] trainrc = new double[5]; // double[] testrc = new double[5]; // double[] cvrc = new double[5]; if (j == 1) nbExp = 1; else nbExp = tmp; System.out.println("nbPrototypes=" + j); for (int n = 0; n < nbExp; n++) { // System.out.println("This is the "+n+" time."); DTWKNNClassifierKMeansCached classifierKMeans = new DTWKNNClassifierKMeansCached(); classifierKMeans.setNbPrototypesPerClass(j); classifierKMeans.setFillPrototypes(true); startTime = System.currentTimeMillis(); classifierKMeans.buildClassifier(train); endTime = System.currentTimeMillis(); duration = endTime - startTime; int[] classDistrib = PrototyperUtil .getPrototypesPerClassDistribution(classifierKMeans.prototypes, train); Evaluation evaltest = new Evaluation(train); evaltest.evaluateModel(classifierKMeans, test); avgerror[n] = evaltest.errorRate(); avgf1[n] = evaltest.fMeasure(0); // Evaluation evaltrain = new Evaluation(train); // evaltrain.evaluateModel(classifierKMeans, train); /*DTWKNNClassifierKMeansCached KMeans = new DTWKNNClassifierKMeansCached(); 
KMeans.setNbPrototypesPerClass(j); KMeans.setFillPrototypes(true); Evaluation evalcv = new Evaluation(train); Random rand = new Random(1); evalcv.crossValidateModel(KMeans, train, 10, rand); double CVError = evalcv.errorRate(); System.out.println("CVError:"+CVError+"\n");*/ // PrototyperUtil.savePrototypes(classifierKMeans.prototypes, rep + "/" + dataName + "_KMEANS[" + j + "]_XP" + n + ".proto"); // out.format("%s,%s,%d,%.4f,%.4f,%.4f\n", dataName, algo, (j * train.numClasses()), testError,CVError,trainError); // out.flush(); // trainrc[n]=trainError; // testrc[n]=testError; // cvrc[n]=CVError; // if (n == 4) { // if (j == 1) { // trainrctmp = trainrc; // testrctmp = testrc; // cvrctmp = cvrc; // } else { // if (Arrays.equals(trainrc, trainrctmp) && Arrays.equals(testrc, testrctmp) // && Arrays.equals(cvrc, cvrctmp)) { // System.out.println("Stable at " + j); // stopflag=true; // } else { // trainrctmp = trainrc; // testrctmp = testrc; // cvrctmp = cvrc; // } // } // } } System.out .println("TestError:" + Utils.mean(avgerror) + "\tF-Measures:" + Utils.mean(avgf1) + "\n"); // if(stopflag==true) // break; } // outProto.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:classif.ExperimentsLauncher.java
License:Open Source License
public void launchPUKMeans() { try {// w w w. ja va2 s. c o m // File f = new File(rep + "/" + dataName + "_results.csv"); // // if somebody is processing it // if (f.exists()) { // return; // } // // out = new PrintStream(new FileOutputStream(rep + "/KMeansDTW_" + "all" + "_results.csv", true)); // out.println("dataset,algorithm,nbPrototypes,testErrorRate,trainErrorRate"); String algo = "PUKMEANS"; System.out.println(algo); // PrintStream outProto = new PrintStream(new FileOutputStream(rep + "/" + dataName + "_KMEANS.proto", append)); nbPrototypesMax = this.train.numInstances() / this.train.numClasses(); if (nbPrototypesMax > 100) nbPrototypesMax = 50; int tmp; tmp = nbExp; double[] avgerror = new double[5]; double[] avgf1 = new double[5]; for (int j = 1; j <= nbPrototypesMax; j += 1) { if (j == 1) nbExp = 1; else nbExp = tmp; System.out.println("nbPrototypes=" + j); for (int n = 0; n < nbExp; n++) { // System.out.println("This is the "+n+" time."); DTWPUKMeans classifierKMeans = new DTWPUKMeans(); classifierKMeans.setNbClustersinUNL(j); startTime = System.currentTimeMillis(); classifierKMeans.buildClassifier(train); endTime = System.currentTimeMillis(); duration = endTime - startTime; // Duration traintime = Duration.ofMillis(duration); // System.out.println(traintime); Evaluation eval = new Evaluation(train); eval.evaluateModel(classifierKMeans, test); avgerror[n] = eval.errorRate(); avgf1[n] = eval.fMeasure(0); // PrototyperUtil.savePrototypes(classifierKMeans.prototypes, rep + "/" + dataName + "_KMEANS[" + j + "]_XP" + n + ".proto"); // out.format("%s,%s,%d,%.4f\n", dataName, algo, (j * train.numClasses()), testError); // out.flush(); } System.out .println("TestError:" + Utils.mean(avgerror) + "\tF-Measures:" + Utils.mean(avgf1) + "\n"); } // outProto.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:com.evaluation.ConfidenceLabelBasedMeasures.java
License:Open Source License
private void computeMeasures(MultiLabelOutput[] output, boolean[][] trueLabels) { int numLabels = trueLabels[0].length; // AUC/*from www .j ava 2 s. c o m*/ FastVector[] m_Predictions = new FastVector[numLabels]; for (int j = 0; j < numLabels; j++) m_Predictions[j] = new FastVector(); FastVector all_Predictions = new FastVector(); int numInstances = output.length; for (int instanceIndex = 0; instanceIndex < numInstances; instanceIndex++) { double[] confidences = output[instanceIndex].getConfidences(); for (int labelIndex = 0; labelIndex < numLabels; labelIndex++) { int classValue; boolean actual = trueLabels[instanceIndex][labelIndex]; if (actual) classValue = 1; else classValue = 0; double[] dist = new double[2]; dist[1] = confidences[labelIndex]; dist[0] = 1 - dist[1]; m_Predictions[labelIndex].addElement(new NominalPrediction(classValue, dist, 1)); all_Predictions.addElement(new NominalPrediction(classValue, dist, 1)); } } labelAUC = new double[numLabels]; for (int i = 0; i < numLabels; i++) { ThresholdCurve tc = new ThresholdCurve(); Instances result = tc.getCurve(m_Predictions[i], 1); labelAUC[i] = ThresholdCurve.getROCArea(result); } auc[Averaging.MACRO.ordinal()] = Utils.mean(labelAUC); ThresholdCurve tc = new ThresholdCurve(); Instances result = tc.getCurve(all_Predictions, 1); auc[Averaging.MICRO.ordinal()] = ThresholdCurve.getROCArea(result); }
From source file:com.evaluation.LabelBasedMeasures.java
License:Open Source License
private void computeMeasures(MultiLabelOutput[] output, boolean[][] trueLabels) { int numLabels = trueLabels[0].length; //Counters are doubles to avoid typecasting //when performing divisions. It makes the code a //little cleaner but: //TODO: run performance tests on counting with doubles falsePositives = new double[numLabels]; truePositives = new double[numLabels]; falseNegatives = new double[numLabels]; trueNegatives = new double[numLabels]; //Count TP, TN, FP, FN int numInstances = output.length; for (int instanceIndex = 0; instanceIndex < numInstances; instanceIndex++) { boolean[] bipartition = output[instanceIndex].getBipartition(); for (int labelIndex = 0; labelIndex < numLabels; labelIndex++) { boolean actual = trueLabels[instanceIndex][labelIndex]; boolean predicted = bipartition[labelIndex]; if (actual && predicted) truePositives[labelIndex]++; else if (!actual && !predicted) trueNegatives[labelIndex]++; else if (predicted) falsePositives[labelIndex]++; else/* w w w. j a v a2s . c o m*/ falseNegatives[labelIndex]++; } } // Evaluation measures for individual labels labelAccuracy = new double[numLabels]; labelRecall = new double[numLabels]; labelPrecision = new double[numLabels]; labelFMeasure = new double[numLabels]; //Compute macro averaged measures for (int labelIndex = 0; labelIndex < numLabels; labelIndex++) { labelAccuracy[labelIndex] = (truePositives[labelIndex] + trueNegatives[labelIndex]) / numInstances; labelRecall[labelIndex] = truePositives[labelIndex] + falseNegatives[labelIndex] == 0 ? 0 : truePositives[labelIndex] / (truePositives[labelIndex] + falseNegatives[labelIndex]); labelPrecision[labelIndex] = truePositives[labelIndex] + falsePositives[labelIndex] == 0 ? 
0 : truePositives[labelIndex] / (truePositives[labelIndex] + falsePositives[labelIndex]); labelFMeasure[labelIndex] = computeF1Measure(labelPrecision[labelIndex], labelRecall[labelIndex]); } accuracy[Averaging.MACRO.ordinal()] = Utils.mean(labelAccuracy); recall[Averaging.MACRO.ordinal()] = Utils.mean(labelRecall); precision[Averaging.MACRO.ordinal()] = Utils.mean(labelPrecision); fMeasure[Averaging.MACRO.ordinal()] = Utils.mean(labelFMeasure); //Compute micro averaged measures double tp = Utils.sum(truePositives); double tn = Utils.sum(trueNegatives); double fp = Utils.sum(falsePositives); double fn = Utils.sum(falseNegatives); accuracy[Averaging.MICRO.ordinal()] = (tp + tn) / (numInstances * numLabels); recall[Averaging.MICRO.ordinal()] = tp + fn == 0 ? 0 : tp / (tp + fn); precision[Averaging.MICRO.ordinal()] = tp + fp == 0 ? 0 : tp / (tp + fp); fMeasure[Averaging.MICRO.ordinal()] = computeF1Measure(precision[Averaging.MICRO.ordinal()], recall[Averaging.MICRO.ordinal()]); }
From source file:com.github.polarisation.kea.main.KEAKeyphraseExtractor.java
License:Open Source License
/*
 * Review notes (code below left byte-identical; scraped one-line formatting preserved):
 *
 * Purpose: configures m_KEAFilter from this extractor's settings (phrase count,
 * vocabulary, language, stemmer, stopwords; thesaurus loaded unless vocabulary is
 * "none"), then, for every document stem in `stems`, reads <stem>.txt (document
 * text) and <stem>.key (author keyphrases, if any) from m_dirName, pushes the pair
 * through m_KEAFilter, collects the top m_numPhrases ranked keyphrase instances,
 * writes a new <stem>.key file when none exists, and counts how many extracted
 * phrases match the existing ones (last attribute == 1). Finally prints the mean
 * +/- stddev (Utils.mean / Utils.variance) of correct matches over all documents
 * that produced at least one extracted phrase.
 *
 * @param stems  table of document stems; keys name the .txt/.key files under m_dirName
 * @throws Exception when `stems` is empty; filter/evaluation errors propagate
 *
 * NOTE(review): the InputStreamReader `is` instances are never closed (resource
 * leak) — consider try/finally around each read loop. The string literal after
 * "-- Extracting Keyphrases..." is split across lines by the scrape and must be
 * rejoined before this compiles. Unreadable .txt/.key files are deliberately
 * mapped to missing values rather than aborting the run.
 */
/** * Builds the model from the files//from w ww. jav a 2 s .c o m */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data // = if there any files in the directory if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } m_KEAFilter.setNumPhrases(m_numPhrases); m_KEAFilter.setVocabulary(m_vocabulary); m_KEAFilter.setVocabularyFormat(m_vocabularyFormat); m_KEAFilter.setDocumentLanguage(getDocumentLanguage()); m_KEAFilter.setStemmer(m_Stemmer); m_KEAFilter.setStopwords(m_Stopwords); if (getVocabulary().equals("none")) { m_KEAFilter.m_NODEfeature = false; } else { m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); atts.addElement(new Attribute("filename", (String) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); if (m_KEAFilter.m_Dictionary == null) { buildGlobalDictionaries(stems); } System.err.println("-- Extracting Keyphrases... 
"); // Extract keyphrases Enumeration elem = stems.keys(); // Enumeration over all files in the directory (now in the hash): while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new InputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(key), m_encoding); } else { is = new InputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; // keyStr = keyphrases in the str.key file // Kea assumes, that these keyphrases were assigned by the author // and evaluates extracted keyphrases againse these while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No existing keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { 
topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.err.println("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); // m_KEAFilter.batchFinished(); }
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/*
 * Review notes (code below left byte-identical; scraped one-line formatting preserved):
 *
 * Purpose: builds the feature vector (length m_NumFeatures + 1, last slot = class
 * value) for one candidate phrase id:
 *  - TFxIDF: (local freq / length) * -log((global freq + 1) / (m_NumDocs + 1)),
 *    with the global count decremented by one while training so the current
 *    document does not count itself;
 *  - first occurrence: position counter of the first appearance / document length;
 *  - keyphrase frequency (only if m_KFused): global keyphrase count, minus one
 *    during training when this document also assigned the phrase;
 *  - stdev of appearance positions (only if m_STDEVfeature), computed from the
 *    per-appearance counters in phraseInfo slot 3; a thesaurus-based alternative
 *    is retained in comments as a past experiment;
 *  - node degree (only if m_NODEfeature): number of related vocabulary ids that
 *    also occur in `hash`;
 *  - term length in words (only if m_LENGTHfeature), falling back to 1.0 when the
 *    vocabulary has no original form for the id;
 *  - class value: missing when hashKeysEval is null (no author keyphrases), 0 when
 *    the id is not an assigned keyphrase, else indexer-count / m_Indexers.
 *
 * @return the populated feature array newInst
 *
 * NOTE(review): "Conmputes" typo in the original javadoc; the stray ";;" after the
 * vals[i] assignment is a harmless empty statement. Commented-out experiments are
 * retained verbatim from the original.
 */
/** * Conmputes the feature values for a given phrase. *///from w w w. j a v a 2 s .co m private double[] featVals(String id, FastVector phraseInfo, boolean training, HashMap<String, Counter> hashKeysEval, HashMap<String, Counter> hashKeyphrases, int length, HashMap<String, FastVector> hash) { // Compute feature values Counter counterLocal = (Counter) phraseInfo.elementAt(1); double[] newInst = new double[m_NumFeatures + 1]; // Compute TFxIDF Counter counterGlobal = (Counter) m_Dictionary.get(id); double localVal = counterLocal.value(), globalVal = 0; if (counterGlobal != null) { globalVal = counterGlobal.value(); if (training) { globalVal = globalVal - 1; } } // Just devide by length to get approximation of probability // that phrase in document is our phrase // newInst[m_TfidfIndex] = (localVal / ((double)length)); newInst[m_TfidfIndex] = (localVal / ((double) length)) * (-Math.log((globalVal + 1) / ((double) m_NumDocs + 1))); // Compute first occurrence Counter counterFirst = (Counter) phraseInfo.elementAt(0); newInst[m_FirstOccurIndex] = (double) counterFirst.value() / (double) length; // Is keyphrase frequency attribute being used? if (m_KFused) { Counter keyphraseC = (Counter) m_KeyphraseDictionary.get(id); if ((training) && (hashKeyphrases != null) && (hashKeyphrases.containsKey(id))) { newInst[m_KeyFreqIndex] = keyphraseC.value() - 1; } else { if (keyphraseC != null) { newInst[m_KeyFreqIndex] = keyphraseC.value(); } else { newInst[m_KeyFreqIndex] = 0; } } } // Is term appearance attribute being used? 
if (m_STDEVfeature) { FastVector app = (FastVector) phraseInfo.elementAt(3); double[] vals = new double[app.size()]; for (int i = 0; i < vals.length; i++) { vals[i] = ((Counter) app.elementAt(i)).value() / (double) length; ; } double mean = Utils.mean(vals); double summ = 0.0; for (int i = 0; i < vals.length; i++) { double a = vals[i]; //log.info("Appearence " + i + " is at " + a); summ += (a - mean) * (a - mean); } double stdev = Math.sqrt(summ / (double) app.size()); newInst[m_STDEVIndex] = stdev; /* Using instead of STDEV feature a thesaurus based feature (experiment) if (m_Vocabulary.getRelated(id,"compositeOf") != null) { //log.info(m_Vocabulary.getOrig(id) + " is a composite!"); newInst[m_STDEVIndex] = 1.0; } else { newInst[m_STDEVIndex] = 0.0; } */ } // Is node degree attribute being used? if (m_NODEfeature) { Vector<String> idsRT = m_Vocabulary.getRelated(id); int intern = 0; if (idsRT != null) { for (int d = 0; d < idsRT.size(); d++) { if (hash.get(idsRT.elementAt(d)) != null) { intern++; } } } // log.info("Node feature for " + m_Vocabulary.getOrig(id) + " = " + intern); newInst[m_NodeIndex] = (double) intern; } // Is term length attribute being used? 
if (m_LENGTHfeature) { String original; if (m_vocabulary.equals("none")) { original = id; } else { original = m_Vocabulary.getOrig(id); } if (original == null) { log.info("problem with id " + id); newInst[m_LengthIndex] = 1.0; } else { String[] words = split(original, " "); newInst[m_LengthIndex] = (double) words.length; } } // Compute class value if (hashKeysEval == null) { // no author-assigned keyphrases newInst[m_NumFeatures] = Instance.missingValue(); } else if (!hashKeysEval.containsKey(id)) { newInst[m_NumFeatures] = 0; // Not a keyphrase // Experiment with giving phrases related to manually chosen one // higher values than to unrelated ones /*Vector related = (Vector)m_Vocabulary.getRelated(id); // if this id is related to one of the keyphrases, set its class value to 0.5 if (related != null) { Enumeration en = related.elements(); while (en.hasMoreElements()) { String relID = (String)en.nextElement(); if (hashKeysEval.containsKey(relID)) { newInst[m_NumFeatures] = 1; // Keyphrase } } } */ } else { //hashKeysEval.remove(id); //newInst[m_NumFeatures] = 1; // Keyphrase // Learning from multiple-indexer's data // log.info(m_Indexers); // log.info("Calculating class value with m_Indexers = " + m_Indexers); double c = (double) ((Counter) hashKeysEval.get(id)).value() / m_Indexers; newInst[m_NumFeatures] = c; // Keyphrase // Or simple learning from 1 indexer: // newInst[m_NumFeatures] = 1.0; // Keyphrase } return newInst; }
From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License:Open Source License
/*
 * Review notes (code below left byte-identical; scraped one-line formatting preserved):
 *
 * Purpose: OpenKM variant of the KEA keyphrase extractor. Configures m_KEAFilter,
 * then for every document stem in `stems` reads <stem>.txt and <stem>.key from
 * m_dirName, runs the pair through the filter, collects the top m_numPhrases
 * ranked keyphrase instances, writes a new <stem>.key file when none exists, and
 * counts matches against existing author keyphrases. Prints the mean +/- stddev
 * (Utils.mean / Utils.variance) of correct matches over all documents with at
 * least one extracted phrase, via the SLF4J-style `log` instead of System.err.
 *
 * @param stems  table of document stems mapped to Double values; keys name the
 *               .txt/.key files under m_dirName
 * @throws Exception when `stems` is empty; filter/evaluation errors propagate
 *
 * NOTE(review): the InputStreamReader `is` instances are never closed (resource
 * leak). The `rank >= 0.00` probability threshold currently admits every phrase —
 * confirm the intended cutoff. The string literal after "-- Extracting
 * Keyphrases..." is split across lines by the scrape; rejoin before compiling.
 */
/** * Builds the model from the files/*ww w . j av a 2 s . com*/ */ public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception { Vector<Double> stats = new Vector<Double>(); // Check whether there is actually any data // = if there any files in the directory if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } m_KEAFilter.setNumPhrases(m_numPhrases); m_KEAFilter.setVocabulary(m_vocabulary); m_KEAFilter.setVocabularyFormat(m_vocabularyFormat); m_KEAFilter.setDocumentLanguage(getDocumentLanguage()); m_KEAFilter.setStemmer(m_Stemmer); m_KEAFilter.setStopwords(m_Stopwords); if (getVocabulary().equals("none")) { m_KEAFilter.m_NODEfeature = false; } else { m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); atts.addElement(new Attribute("filename", (String) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); if (m_KEAFilter.m_Dictionary == null) { buildGlobalDictionaries(stems); } log.info("-- Extracting Keyphrases... 
"); // Extract keyphrases Enumeration<String> elem = stems.keys(); // Enumeration over all files in the directory (now in the hash): while (elem.hasMoreElements()) { String str = elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new InputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { log.debug("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(key), m_encoding); } else { is = new InputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; // keyStr = keyphrases in the str.key file // Kea assumes, that these keyphrases were assigned by the // author // and evaluates extracted keyphrases againse these while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { log.debug("No existing keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { log.debug("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if 
(m_debug) { log.debug("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { // My addition: to exclude low ranking phrases double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()); if (rank >= 0.00) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print( topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { log.debug("" + topRankedInstances[i]); } } } } if (numExtracted > 0) { if (m_debug) { log.debug("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); log.info("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); log.info("Based on " + stats.size() + " documents"); // m_KEAFilter.batchFinished(); }
From source file:data.statistics.MLStatistics.java
License:Open Source License
/** * Computes the average of any IR vector. * /*w ww . java2 s . c o m*/ * @param IR * An IR vector previously computed * @return double */ public double averageIR(double[] IR) { return Utils.mean(IR); }
From source file:jjj.asap.sas.util.Calc.java
License:Open Source License
/**
 * Averages kappa scores via the Fisher z-transformation: each score is mapped
 * into z-space with atanh, the z-values are averaged, and the mean is mapped
 * back with tanh.
 *
 * @param x the kappa scores to average
 * @return the back-transformed mean of the scores
 */
public static double kappa(double[] x) {
    double[] zScores = new double[x.length];
    int i = 0;
    for (double score : x) {
        // atanh(score) = 0.5 * ln((1 + score) / (1 - score))
        zScores[i++] = 0.5 * Math.log((1.0 + score) / (1.0 - score));
    }
    double meanZ = Utils.mean(zScores);
    // tanh(meanZ) = (e^(2*meanZ) - 1) / (e^(2*meanZ) + 1)
    double e = Math.exp(2.0 * meanZ);
    return (e - 1) / (e + 1);
}
From source file:kea.KEAKeyphraseExtractor.java
License:Open Source License
/*
 * Review notes (code below left byte-identical; scraped one-line formatting preserved):
 *
 * Purpose: original KEA keyphrase extractor. For every document stem in `stems`,
 * reads <stem>.txt and <stem>.key from m_dirName (via BOM-stripping readers),
 * runs the pair through m_KEAFilter, collects the top m_numPhrases ranked
 * keyphrase instances, writes a new <stem>.key file when none exists, and counts
 * matches by comparing the last attribute's value against the index of "True".
 * Prints the mean +/- stddev (Utils.mean / Utils.variance) of correct matches,
 * then calls m_KEAFilter.batchFinished() (unlike the forked variants, which
 * leave it commented out).
 *
 * @param stems  table of document stems; keys name the .txt/.key files under m_dirName
 * @throws Exception when `stems` is empty; filter/evaluation errors propagate
 *
 * NOTE(review): the Reader `is` instances are never closed (resource leak). The
 * string literal after "-- " is split across lines by the scrape and must be
 * rejoined ("-- Document: ") before this compiles.
 */
/** * Builds the model from the files/*from www.j a va2s . com*/ */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Extract keyphrases Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- 
Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i] .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i] .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); 
System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); m_KEAFilter.batchFinished(); }