Example usage for weka.core Utils mean

List of usage examples for weka.core Utils mean

Introduction

In this page you can find the example usage for weka.core Utils mean.

Prototype

public static double mean(double[] vector)

Source Link

Document

Computes the mean for an array of doubles.

Usage

From source file:classif.ExperimentsLauncher.java

License:Open Source License

/**
 * Runs the FastKMEANS experiment: for a growing number of prototypes per
 * class, repeatedly builds a cached K-means DTW classifier on the training
 * set, evaluates it on the test set, and prints the test error rate and
 * F-measure averaged over the repetitions.
 */
public void launchFSKMeans() {
    try {
        String algo = "FastKMEANS";
        System.out.println(algo);

        nbPrototypesMax = this.train.numInstances() / this.train.numClasses();
        // NOTE(review): the computed maximum is immediately overwritten with a
        // fixed cap of 50 (a commented-out guard in the original suggests the
        // cap was once conditional) -- confirm this is intentional.
        nbPrototypesMax = 50;

        // Remember the configured repetition count; the nbExp field is
        // temporarily forced to 1 for the single-prototype case below.
        int configuredNbExp = nbExp;

        for (int j = 1; j <= nbPrototypesMax; j++) {
            if (j == 1)
                nbExp = 1;
            else
                nbExp = configuredNbExp;
            System.out.println("nbPrototypes=" + j);

            // Size the accumulators to the actual number of runs so that
            // Utils.mean() is not diluted by unused zero slots (fixed-size-5
            // arrays gave wrong averages whenever nbExp != 5 and would
            // overflow for nbExp > 5).
            double[] avgerror = new double[nbExp];
            double[] avgf1 = new double[nbExp];

            for (int n = 0; n < nbExp; n++) {
                DTWKNNClassifierKMeansCached classifierKMeans = new DTWKNNClassifierKMeansCached();
                classifierKMeans.setNbPrototypesPerClass(j);
                classifierKMeans.setFillPrototypes(true);

                startTime = System.currentTimeMillis();
                classifierKMeans.buildClassifier(train);
                endTime = System.currentTimeMillis();
                duration = endTime - startTime;

                // Kept for parity with the original code; the distribution is
                // currently computed but not reported anywhere.
                PrototyperUtil.getPrototypesPerClassDistribution(classifierKMeans.prototypes, train);

                Evaluation evaltest = new Evaluation(train);
                evaltest.evaluateModel(classifierKMeans, test);
                avgerror[n] = evaltest.errorRate();
                avgf1[n] = evaltest.fMeasure(0);
            }
            System.out
                    .println("TestError:" + Utils.mean(avgerror) + "\tF-Measures:" + Utils.mean(avgf1) + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:classif.ExperimentsLauncher.java

License:Open Source License

/**
 * Runs the PUKMEANS experiment: for a growing number of clusters in the
 * unlabeled set, repeatedly builds a DTW PU-K-means classifier on the
 * training set, evaluates it on the test set, and prints the test error
 * rate and F-measure averaged over the repetitions.
 */
public void launchPUKMeans() {
    try {
        String algo = "PUKMEANS";
        System.out.println(algo);

        nbPrototypesMax = this.train.numInstances() / this.train.numClasses();
        if (nbPrototypesMax > 100)
            nbPrototypesMax = 50;

        // Remember the configured repetition count; the nbExp field is
        // temporarily forced to 1 for the single-cluster case below.
        int configuredNbExp = nbExp;

        for (int j = 1; j <= nbPrototypesMax; j += 1) {
            if (j == 1)
                nbExp = 1;
            else
                nbExp = configuredNbExp;
            System.out.println("nbPrototypes=" + j);

            // Size the accumulators to the actual number of runs so that
            // Utils.mean() is not diluted by unused zero slots (fixed-size-5
            // arrays gave wrong averages whenever nbExp != 5 and would
            // overflow for nbExp > 5).
            double[] avgerror = new double[nbExp];
            double[] avgf1 = new double[nbExp];

            for (int n = 0; n < nbExp; n++) {
                DTWPUKMeans classifierKMeans = new DTWPUKMeans();
                classifierKMeans.setNbClustersinUNL(j);
                startTime = System.currentTimeMillis();
                classifierKMeans.buildClassifier(train);
                endTime = System.currentTimeMillis();
                duration = endTime - startTime;

                Evaluation eval = new Evaluation(train);
                eval.evaluateModel(classifierKMeans, test);
                avgerror[n] = eval.errorRate();
                avgf1[n] = eval.fMeasure(0);
            }
            System.out
                    .println("TestError:" + Utils.mean(avgerror) + "\tF-Measures:" + Utils.mean(avgf1) + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.evaluation.ConfidenceLabelBasedMeasures.java

License:Open Source License

/**
 * Computes AUC measures from the classifier confidences: one ROC area per
 * label, their macro-average, and a micro-average over the pooled
 * predictions of all labels.
 */
private void computeMeasures(MultiLabelOutput[] output, boolean[][] trueLabels) {
    int labelCount = trueLabels[0].length;
    int exampleCount = output.length;

    // One prediction list per label, plus one pooled list for the
    // micro-averaged AUC.
    FastVector[] perLabelPredictions = new FastVector[labelCount];
    for (int label = 0; label < labelCount; label++) {
        perLabelPredictions[label] = new FastVector();
    }
    FastVector pooledPredictions = new FastVector();

    for (int example = 0; example < exampleCount; example++) {
        double[] confidences = output[example].getConfidences();
        for (int label = 0; label < labelCount; label++) {
            int classValue = trueLabels[example][label] ? 1 : 0;

            // Two-class distribution: slot 1 holds the confidence for the
            // positive class, slot 0 its complement.
            double[] distribution = new double[2];
            distribution[1] = confidences[label];
            distribution[0] = 1 - distribution[1];

            perLabelPredictions[label].addElement(new NominalPrediction(classValue, distribution, 1));
            pooledPredictions.addElement(new NominalPrediction(classValue, distribution, 1));
        }
    }

    labelAUC = new double[labelCount];
    for (int label = 0; label < labelCount; label++) {
        Instances curve = new ThresholdCurve().getCurve(perLabelPredictions[label], 1);
        labelAUC[label] = ThresholdCurve.getROCArea(curve);
    }
    auc[Averaging.MACRO.ordinal()] = Utils.mean(labelAUC);

    Instances pooledCurve = new ThresholdCurve().getCurve(pooledPredictions, 1);
    auc[Averaging.MICRO.ordinal()] = ThresholdCurve.getROCArea(pooledCurve);
}

From source file:com.evaluation.LabelBasedMeasures.java

License:Open Source License

/**
 * Computes accuracy, recall, precision, and F1 per label, plus their
 * macro- and micro-averages, from the predicted bipartitions.
 */
private void computeMeasures(MultiLabelOutput[] output, boolean[][] trueLabels) {
    int labelCount = trueLabels[0].length;
    int exampleCount = output.length;

    // Per-label confusion-matrix counters; stored as doubles so the
    // divisions below need no casting.
    falsePositives = new double[labelCount];
    truePositives = new double[labelCount];
    falseNegatives = new double[labelCount];
    trueNegatives = new double[labelCount];

    for (int example = 0; example < exampleCount; example++) {
        boolean[] predictedSet = output[example].getBipartition();

        for (int label = 0; label < labelCount; label++) {
            boolean actual = trueLabels[example][label];
            boolean predicted = predictedSet[label];

            if (actual) {
                if (predicted) {
                    truePositives[label]++;
                } else {
                    falseNegatives[label]++;
                }
            } else {
                if (predicted) {
                    falsePositives[label]++;
                } else {
                    trueNegatives[label]++;
                }
            }
        }
    }

    labelAccuracy = new double[labelCount];
    labelRecall = new double[labelCount];
    labelPrecision = new double[labelCount];
    labelFMeasure = new double[labelCount];

    // Macro-averaged measures: compute each measure per label, then the
    // fields below hold their means. Ratios with an empty denominator are
    // defined as 0.
    for (int label = 0; label < labelCount; label++) {
        double tp = truePositives[label];
        double tn = trueNegatives[label];
        double fp = falsePositives[label];
        double fn = falseNegatives[label];

        labelAccuracy[label] = (tp + tn) / exampleCount;
        labelRecall[label] = (tp + fn == 0) ? 0 : tp / (tp + fn);
        labelPrecision[label] = (tp + fp == 0) ? 0 : tp / (tp + fp);
        labelFMeasure[label] = computeF1Measure(labelPrecision[label], labelRecall[label]);
    }
    accuracy[Averaging.MACRO.ordinal()] = Utils.mean(labelAccuracy);
    recall[Averaging.MACRO.ordinal()] = Utils.mean(labelRecall);
    precision[Averaging.MACRO.ordinal()] = Utils.mean(labelPrecision);
    fMeasure[Averaging.MACRO.ordinal()] = Utils.mean(labelFMeasure);

    // Micro-averaged measures: pool the counters over all labels first.
    double tp = Utils.sum(truePositives);
    double tn = Utils.sum(trueNegatives);
    double fp = Utils.sum(falsePositives);
    double fn = Utils.sum(falseNegatives);

    accuracy[Averaging.MICRO.ordinal()] = (tp + tn) / (exampleCount * labelCount);
    recall[Averaging.MICRO.ordinal()] = (tp + fn == 0) ? 0 : tp / (tp + fn);
    precision[Averaging.MICRO.ordinal()] = (tp + fp == 0) ? 0 : tp / (tp + fp);
    fMeasure[Averaging.MICRO.ordinal()] = computeF1Measure(precision[Averaging.MICRO.ordinal()],
            recall[Averaging.MICRO.ordinal()]);
}

From source file:com.github.polarisation.kea.main.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Extracts keyphrases for every document stem in the given hashtable,
 * writes a ".key" file for documents that do not already have one, and
 * prints statistics on how many extracted keyphrases match existing
 * author-assigned ones.
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            // Document text -> string attribute 0; a missing value is
            // substituted when the document cannot be read.
            newInst[0] = (double) data.attribute(0)
                    .addStringValue(readDocumentText(new File(m_dirName + "/" + str + ".txt")));
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            // Author-assigned keyphrases from str.key -> string attribute 1;
            // KEA evaluates the extracted keyphrases against these.
            newInst[1] = (double) data.attribute(1)
                    .addStringValue(readDocumentText(new File(m_dirName + "/" + str + ".key")));
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Collect the filter's output, indexed by rank (rank is 1-based).
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        // Only write a .key file when none exists yet, so author-assigned
        // keyphrases are never overwritten.
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // The last attribute marks whether the extracted phrase
                // matches an author-assigned keyphrase (value 1).
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(Double.valueOf(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.err.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

/**
 * Reads the whole contents of a file into a string, honoring the
 * configured encoding, and always closes the underlying reader (the
 * original inline readers were leaked).
 */
private String readDocumentText(File file) throws Exception {
    InputStreamReader is;
    if (!m_encoding.equals("default")) {
        is = new InputStreamReader(new FileInputStream(file), m_encoding);
    } else {
        is = new InputStreamReader(new FileInputStream(file));
    }
    try {
        StringBuffer contents = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
            contents.append((char) c);
        }
        return contents.toString();
    } finally {
        is.close();
    }
}

From source file:com.openkm.kea.filter.KEAFilter.java

License:Open Source License

/**
 * Computes the feature-value vector for a single candidate phrase.
 *
 * @param id the phrase identifier used as key into the dictionaries
 * @param phraseInfo per-phrase counters: element 0 = first-occurrence
 *        position, element 1 = local frequency, element 3 = appearance
 *        positions (read only when the STDEV feature is enabled)
 * @param training true when building the model; global counts then exclude
 *        the current document
 * @param hashKeysEval manually assigned keyphrases for evaluation, or null
 *        when none exist
 * @param hashKeyphrases keyphrases of the training documents, or null
 * @param length document length, used to normalize frequency and position
 * @param hash candidate phrases of the current document (used by the
 *        node-degree feature)
 * @return array of m_NumFeatures feature values with the class value in
 *         the last slot
 */
private double[] featVals(String id, FastVector phraseInfo, boolean training,
        HashMap<String, Counter> hashKeysEval, HashMap<String, Counter> hashKeyphrases, int length,
        HashMap<String, FastVector> hash) {

    // Compute feature values
    Counter counterLocal = (Counter) phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];

    // Compute TFxIDF
    Counter counterGlobal = (Counter) m_Dictionary.get(id);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
        globalVal = counterGlobal.value();
        if (training) {
            // Exclude the current document from the global count.
            globalVal = globalVal - 1;
        }
    }

    // Just divide by length to get approximation of probability
    // that phrase in document is our phrase
    // newInst[m_TfidfIndex] = (localVal / ((double)length));
    newInst[m_TfidfIndex] = (localVal / ((double) length))
            * (-Math.log((globalVal + 1) / ((double) m_NumDocs + 1)));

    // Compute first occurrence (position of first appearance, normalized
    // by document length)
    Counter counterFirst = (Counter) phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double) counterFirst.value() / (double) length;

    // Is keyphrase frequency attribute being used?
    if (m_KFused) {
        Counter keyphraseC = (Counter) m_KeyphraseDictionary.get(id);
        if ((training) && (hashKeyphrases != null) && (hashKeyphrases.containsKey(id))) {
            // Exclude the current document's own keyphrase occurrence.
            newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
        } else {
            if (keyphraseC != null) {
                newInst[m_KeyFreqIndex] = keyphraseC.value();
            } else {
                newInst[m_KeyFreqIndex] = 0;
            }
        }
    }

    // Is term appearance attribute being used?
    // Standard deviation of the phrase's normalized appearance positions.
    if (m_STDEVfeature) {
        FastVector app = (FastVector) phraseInfo.elementAt(3);

        double[] vals = new double[app.size()];
        for (int i = 0; i < vals.length; i++) {
            vals[i] = ((Counter) app.elementAt(i)).value() / (double) length;
            ;
        }

        double mean = Utils.mean(vals);
        double summ = 0.0;
        for (int i = 0; i < vals.length; i++) {
            double a = vals[i];
            //log.info("Appearence " + i + " is at " + a);
            summ += (a - mean) * (a - mean);
        }
        // Population standard deviation (divides by the count, not count-1).
        double stdev = Math.sqrt(summ / (double) app.size());

        newInst[m_STDEVIndex] = stdev;

        /* Using instead of STDEV feature a thesaurus based feature (experiment)
         if (m_Vocabulary.getRelated(id,"compositeOf") != null) {
         //log.info(m_Vocabulary.getOrig(id) + " is a composite!");
          newInst[m_STDEVIndex] = 1.0;
          } else {
          newInst[m_STDEVIndex] = 0.0;
          }
          */

    }

    // Is node degree attribute being used?
    // Counts how many thesaurus-related terms also occur in this document.
    if (m_NODEfeature) {

        Vector<String> idsRT = m_Vocabulary.getRelated(id);

        int intern = 0;
        if (idsRT != null) {
            for (int d = 0; d < idsRT.size(); d++) {
                if (hash.get(idsRT.elementAt(d)) != null) {
                    intern++;
                }
            }
        }
        // log.info("Node feature for " + m_Vocabulary.getOrig(id) + " = " + intern);

        newInst[m_NodeIndex] = (double) intern;

    }

    // Is term length attribute being used?
    // Number of words in the original (unstemmed) phrase; defaults to 1
    // when the vocabulary has no original form for this id.
    if (m_LENGTHfeature) {
        String original;
        if (m_vocabulary.equals("none")) {
            original = id;
        } else {
            original = m_Vocabulary.getOrig(id);
        }
        if (original == null) {
            log.info("problem with id " + id);
            newInst[m_LengthIndex] = 1.0;
        } else {
            String[] words = split(original, " ");
            newInst[m_LengthIndex] = (double) words.length;
        }

    }

    // Compute class value

    if (hashKeysEval == null) { // no author-assigned keyphrases
        newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(id)) {

        newInst[m_NumFeatures] = 0; // Not a keyphrase

        // Experiment with giving phrases related to manually chosen one
        // higher values than to unrelated ones
        /*Vector related = (Vector)m_Vocabulary.getRelated(id);
         // if this id is related to one of the keyphrases, set its class value to 0.5
          if (related != null) {         
          Enumeration en = related.elements();         
          while (en.hasMoreElements()) {
          String relID = (String)en.nextElement();
          if (hashKeysEval.containsKey(relID)) {      
          newInst[m_NumFeatures] = 1; // Keyphrase
          }            
          }   
          }
          */

    } else {
        //hashKeysEval.remove(id);
        //newInst[m_NumFeatures] = 1; // Keyphrase

        // Learning from multiple-indexer's data: class value is the fraction
        // of indexers that chose this phrase.
        // log.info(m_Indexers);
        // log.info("Calculating class value with m_Indexers = " + m_Indexers);

        double c = (double) ((Counter) hashKeysEval.get(id)).value() / m_Indexers;
        newInst[m_NumFeatures] = c; // Keyphrase

        // Or simple learning from 1 indexer:
        // newInst[m_NumFeatures] = 1.0; // Keyphrase
    }
    return newInst;
}

From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files/*ww w .  j av a  2  s . com*/
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the
            // author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());

                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(
                                    topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }

        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }

        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    log.info("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2)
            + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:data.statistics.MLStatistics.java

License:Open Source License

/**
 * Computes the average of any IR vector.
 * /*w ww  . java2 s . c o m*/
 * @param IR
 *            An IR vector previously computed
 * @return double
 */
public double averageIR(double[] IR) {
    return Utils.mean(IR);
}

From source file:jjj.asap.sas.util.Calc.java

License:Open Source License

/**
 * Averages kappa scores via the Fisher z-transform: each score is mapped
 * into z-space (atanh), the z values are averaged, and the mean is mapped
 * back (tanh).
 *
 * @param x the individual kappa scores, each expected in [-1, 1]
 * @return the back-transformed average; 0 for an empty input, and +/-1
 *         (instead of NaN, as the previous version returned) when a score
 *         of exactly +/-1 drives the z-average to infinity
 */
public static double kappa(double[] x) {

    // Fisher z-transform: z = 0.5 * ln((1+x)/(1-x)) = atanh(x).
    // Summed in ascending index order, matching Utils.mean's accumulation.
    double sum = 0.0;
    for (int i = 0; i < x.length; i++) {
        sum += 0.5 * Math.log((1.0 + x[i]) / (1.0 - x[i]));
    }
    // Empty input: define the z-average as 0, which back-transforms to 0.
    double a = (x.length == 0) ? 0.0 : sum / x.length;

    // BUG FIX: if some x[i] == +/-1.0 then a is +/-Inf, and the original
    // (e - 1) / (e + 1) evaluated to Inf/Inf = NaN. Saturate to +/-1 instead.
    if (Double.isInfinite(a)) {
        return Math.signum(a);
    }
    double e = Math.exp(2.0 * a);
    // exp(2a) can overflow to Inf for large finite a; that is tanh -> 1.
    if (Double.isInfinite(e)) {
        return 1.0;
    }
    return (e - 1) / (e + 1);
}

From source file:kea.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files/*from www.j  a va2s  . com*/
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i]
                        .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]
                                .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}