Example usage for weka.core Utils variance

List of usage examples for weka.core Utils variance

Introduction

In this page you can find the example usage for weka.core Utils variance.

Prototype

public staticdouble variance(double[] vector) 

Source Link

Document

Computes the variance for an array of doubles.

Usage

From source file:PrincipalComponents.java

License:Open Source License

private void fillCovariance() throws Exception {
    // first store the means
    m_means = new double[m_trainInstances.numAttributes()];
    m_stdDevs = new double[m_trainInstances.numAttributes()];
    for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
        m_means[i] = m_trainInstances.meanOrMode(i);
        m_stdDevs[i] = Math.sqrt(Utils.variance(m_trainInstances.attributeToDoubleArray(i)));
    }/*w  w  w. j  a  v a  2 s .  c  o  m*/

    // just center the data or standardize it?
    if (m_center) {
        m_centerFilter = new Center();
        m_centerFilter.setInputFormat(m_trainInstances);
        m_trainInstances = Filter.useFilter(m_trainInstances, m_centerFilter);
    } else {
        m_standardizeFilter = new Standardize();
        m_standardizeFilter.setInputFormat(m_trainInstances);
        m_trainInstances = Filter.useFilter(m_trainInstances, m_standardizeFilter);
    }

    // now compute the covariance matrix
    m_correlation = new UpperSymmDenseMatrix(m_numAttribs);
    for (int i = 0; i < m_numAttribs; i++) {
        for (int j = i; j < m_numAttribs; j++) {

            double cov = 0;
            for (Instance inst : m_trainInstances) {
                cov += inst.value(i) * inst.value(j);
            }

            cov /= m_trainInstances.numInstances() - 1;
            m_correlation.set(i, j, cov);
        }
    }
}

From source file:AtomUtils.java

License:Apache License

static public double[] getStandardDeviation(ArrayList<ArrayList<Double>> lists) {
    int numAtts = lists.size();
    assert (numAtts > 0);
    int numInsts = lists.get(0).size();
    assert (numInsts > 0);
    double[] att1 = new double[numInsts];
    double[] std = new double[numAtts];
    for (int i = 0; i < numAtts; i++) {
        for (int j = 0; j < numInsts; j++) {
            att1[j] = lists.get(i).get(j);
        }//from w w w.  ja  v a2 s.c  o m
        std[i] = Math.sqrt(Utils.variance(att1));
    }
    return std;
}

From source file:PCADetector.java

License:Apache License

public double[] getNormalSTD() {
    int numInsts = m_normalPts.size();
    assert (numInsts > 0);
    double[] att1 = new double[numInsts];
    double[] std = new double[m_nDims];
    for (int i = 0; i < m_nDims; i++) {
        for (int j = 0; j < numInsts; j++) {
            att1[j] = m_oriDataMatrix.get(i).get(m_normalPts.get(j));
        }//from   w ww  .j a v a2  s. c  o  m
        std[i] = Math.sqrt(Utils.variance(att1));
    }
    return std;
}

From source file:PCADetector.java

License:Apache License

public double[] getStandardDeviation(Instances Matrix) {
    int numAtts = Matrix.numAttributes();
    int numInsts = Matrix.numInstances();
    double[] att1 = new double[numInsts];
    double[] std = new double[numAtts];
    for (int i = 0; i < numAtts; i++) {
        for (int j = 0; j < numInsts; j++) {
            att1[j] = Matrix.instance(j).value(i);
        }//from   ww  w  . jav a2 s . c  o m
        std[i] = Math.sqrt(Utils.variance(att1));
    }
    return std;
}

From source file:com.github.polarisation.kea.main.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files// www.j  a  v  a 2s.  c o m
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.err.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files//  w  w  w . j a v  a2 s  .  co  m
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the
            // author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());

                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(
                                    topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }

        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }

        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    log.info("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2)
            + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:data.generation.target.utils.PrincipalComponents.java

License:Open Source License

/**
 * Fill the correlation matrix/*  w  w  w.  j  av  a 2s  . c om*/
 */
private void fillCorrelation() throws Exception {
    m_correlation = new double[m_numAttribs][m_numAttribs];
    double[] att1 = new double[m_numInstances];
    double[] att2 = new double[m_numInstances];
    double corr;

    for (int i = 0; i < m_numAttribs; i++) {
        for (int j = 0; j < m_numAttribs; j++) {
            for (int k = 0; k < m_numInstances; k++) {
                att1[k] = m_trainInstances.instance(k).value(i);
                att2[k] = m_trainInstances.instance(k).value(j);
            }
            if (i == j) {
                m_correlation[i][j] = 1.0;
                // store the standard deviation
                m_stdDevs[i] = Math.sqrt(Utils.variance(att1));
            } else {
                corr = Utils.correlation(att1, att2, m_numInstances);
                m_correlation[i][j] = corr;
                m_correlation[j][i] = corr;
            }
        }
    }

    // now standardize the input data
    m_standardizeFilter = new Standardize();
    m_standardizeFilter.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, m_standardizeFilter);
}

From source file:data.statistics.MLStatistics.java

License:Open Source License

/**
 * Computes the variance of any IR vector.
 * /*w  w w .ja v a 2s .com*/
 * @param IR
 *            An IR vector previously computed.
 * @return double.
 */
public double varianceIR(double[] IR) {
    return Utils.variance(IR);
}

From source file:kea.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files/*from  ww  w.  j av a  2  s.  co m*/
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i]
                        .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]
                                .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}

From source file:kea.main.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files//from  ww w.  j  a  va 2 s  . co  m
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // Know thesaurus is loaded in the constructor
        //m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the
            // author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            is.close();

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        this.m_KEAFilter.input(data.instance(0), vocabulary);

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {

            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(
                            topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(
                                topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}