com.openkm.kea.modelcreator.KEAModelBuilder.java Source code

Introduction

Here is the source code for com.openkm.kea.modelcreator.KEAModelBuilder.java
Source

/**
 *  OpenKM, Open Document Management System (http://www.openkm.com)
 *  Copyright (c) 2006-2010  Paco Avila & Josep Llort
 * 
 *  No bytes were intentionally harmed during the development of this application.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.modelcreator;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;

import com.openkm.kea.filter.KEAFilter;
import com.openkm.kea.stemmers.SremovalStemmer;
import com.openkm.kea.stemmers.Stemmer;
import com.openkm.kea.stopwords.Stopwords;
import com.openkm.kea.stopwords.StopwordsEnglish;

/**
 * Builds a keyphrase extraction model from the documents in a given
 * directory.  Assumes that the file names for the documents end with
 * ".txt".  Assumes that files containing corresponding
 * author-assigned keyphrases end with ".key". Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text).
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory (required).<p>
 *
 * -m "model name"<br>
 * Specifies name of model (required).<p>
 *
 * -e "encoding"<br>
 * Specifies encoding (default: the literal "default", meaning the platform charset).<p>
 * 
 * -v "vocabulary name" <br>
 * Specifies vocabulary name, e.g. agrovoc or none (required).<p>
 * 
 * -f "vocabulary format" <br>
 * Specifies vocabulary format (text or skos); required unless the vocabulary is "none".<p>
 *
 * -i "document language" <br>
 * Specifies document language (default: en).<p>
 *
 * -d<br>
 * Turns debugging mode on.<p>
 *
 * -k<br>
 * Use keyphrase frequency statistic.<p>
 *
 * -p<br>
 * Disallow internal periods.<p>
 *
 * -x "length"<br>
 * Sets maximum phrase length (default: 5).<p>
 *
 * -y "length"<br>
 * Sets minimum phrase length (default: 1).<p>
 *
 * -o "number"<br>
 * The minimum number of times a phrase needs to occur (default: 2). <p>
 *
 * -s "name of class implementing list of stop words"<br>
 * Sets list of stop words to use. No list is configured by default; in that
 * case {@link #buildModel(Hashtable, Stopwords)} uses the list passed to it.<p>
 *
 * -t "name of class implementing stemmer"<br>
 * Sets stemmer to use (default: SremovalStemmer). <p>
 *
 * -n<br>
 * Do not check for proper nouns. <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAModelBuilder implements OptionHandler {

    private static final Logger log = LoggerFactory.getLogger(KEAModelBuilder.class);

    /** Package prepended to simple class names given with -s. */
    private static final String STOPWORDS_PACKAGE = "com.openkm.kea.stopwords.";

    /** Package prepended to simple class names given with -t. */
    private static final String STEMMERS_PACKAGE = "com.openkm.kea.stemmers.";

    /** Value of the encoding option that selects the platform charset. */
    private static final String DEFAULT_ENCODING = "default";

    /** Document language used when -i is not given. */
    private static final String DEFAULT_LANGUAGE = "en";

    /** Maximum phrase length used when -x is not given. */
    private static final int DEFAULT_MAX_PHRASE_LENGTH = 5;

    /** Minimum phrase length used when -y is not given. */
    private static final int DEFAULT_MIN_PHRASE_LENGTH = 1;

    /** Minimum number of occurrences used when -o is not given. */
    private static final int DEFAULT_MIN_NUM_OCCUR = 2;

    /** Name of directory */
    String m_dirName = null;

    /** Name of model */
    String m_modelName = null;

    /** Vocabulary name */
    String m_vocabulary = null;

    /** Format of the vocabulary */
    String m_vocabularyFormat = null;

    /** Document language */
    String m_documentLanguage = DEFAULT_LANGUAGE;

    /** Encoding */
    String m_encoding = DEFAULT_ENCODING;

    /** Debugging mode? */
    boolean m_debug = false;

    /** Use keyphrase frequency attribute? */
    boolean m_useKFrequency = false;

    /** Disallow internal periods? */
    boolean m_disallowIPeriods = false;

    /** The maximum length of phrases */
    private int m_MaxPhraseLength = DEFAULT_MAX_PHRASE_LENGTH;

    /** The minimum length of phrases */
    private int m_MinPhraseLength = DEFAULT_MIN_PHRASE_LENGTH;

    /** The minimum number of occurrences of a phrase */
    private int m_MinNumOccur = DEFAULT_MIN_NUM_OCCUR;

    /** The KEA filter object; created by buildModel */
    KEAFilter m_KEAFilter = null;

    /** The stemmer to be used */
    private Stemmer m_Stemmer = new SremovalStemmer();

    /** The list of stop words to be used; null until set explicitly */
    private Stopwords m_Stopwords;

    /** Determines whether check for proper nouns is performed */
    private boolean m_CheckForProperNouns = true;

    /**
     * Tells whether the check for proper nouns is performed.
     *
     * @return true if proper nouns are checked.
     */
    public boolean getCheckForProperNouns() {
        return m_CheckForProperNouns;
    }

    /**
     * Enables or disables the check for proper nouns.
     *
     * @param newM_CheckProperNouns true to check for proper nouns.
     */
    public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
        this.m_CheckForProperNouns = newM_CheckProperNouns;
    }

    /**
     * Get the configured list of stop words.
     *
     * @return the stop word list, or null if none has been set.
     */
    public Stopwords getStopwords() {
        return m_Stopwords;
    }

    /**
     * Set the list of stop words.
     *
     * @param newM_Stopwords the stop word list to use.
     */
    public void setStopwords(Stopwords newM_Stopwords) {
        this.m_Stopwords = newM_Stopwords;
    }

    /**
     * Get the stemmer.
     *
     * @return the stemmer in use.
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Set the stemmer.
     *
     * @param newStemmer the stemmer to use.
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * Get the minimum number of times a phrase needs to occur.
     *
     * @return Value of MinNumOccur.
     */
    public int getMinNumOccur() {
        return m_MinNumOccur;
    }

    /**
     * Set the minimum number of times a phrase needs to occur.
     *
     * @param newMinNumOccur Value to assign to MinNumOccur.
     */
    public void setMinNumOccur(int newMinNumOccur) {
        m_MinNumOccur = newMinNumOccur;
    }

    /**
     * Get the maximum phrase length.
     *
     * @return Value of MaxPhraseLength.
     */
    public int getMaxPhraseLength() {
        return m_MaxPhraseLength;
    }

    /**
     * Set the maximum phrase length.
     *
     * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
     */
    public void setMaxPhraseLength(int newMaxPhraseLength) {
        m_MaxPhraseLength = newMaxPhraseLength;
    }

    /**
     * Get the minimum phrase length.
     *
     * @return Value of MinPhraseLength.
     */
    public int getMinPhraseLength() {
        return m_MinPhraseLength;
    }

    /**
     * Set the minimum phrase length.
     *
     * @param newMinPhraseLength Value to assign to MinPhraseLength.
     */
    public void setMinPhraseLength(int newMinPhraseLength) {
        m_MinPhraseLength = newMinPhraseLength;
    }

    /**
     * Tells whether internal periods are disallowed.
     *
     * @return Value of disallowIPeriods.
     */
    public boolean getDisallowIPeriods() {
        return m_disallowIPeriods;
    }

    /**
     * Set whether internal periods are disallowed.
     *
     * @param newdisallowIPeriods Value to assign to disallowIPeriods.
     */
    public void setDisallowIPeriods(boolean newdisallowIPeriods) {
        m_disallowIPeriods = newdisallowIPeriods;
    }

    /**
     * Tells whether the keyphrase frequency attribute is used.
     *
     * @return Value of useKFrequency.
     */
    public boolean getUseKFrequency() {
        return m_useKFrequency;
    }

    /**
     * Set whether the keyphrase frequency attribute is used.
     *
     * @param newuseKFrequency Value to assign to useKFrequency.
     */
    public void setUseKFrequency(boolean newuseKFrequency) {
        m_useKFrequency = newuseKFrequency;
    }

    /**
     * Tells whether debugging mode is on.
     *
     * @return Value of debug.
     */
    public boolean getDebug() {
        return m_debug;
    }

    /**
     * Turn debugging mode on or off.
     *
     * @param newdebug Value to assign to debug.
     */
    public void setDebug(boolean newdebug) {
        m_debug = newdebug;
    }

    /**
     * Get the encoding used to read documents and keyphrase files.
     *
     * @return Value of encoding; "default" selects the platform charset.
     */
    public String getEncoding() {
        return m_encoding;
    }

    /**
     * Set the encoding used to read documents and keyphrase files.
     *
     * @param newencoding Value to assign to encoding.
     */
    public void setEncoding(String newencoding) {
        m_encoding = newencoding;
    }

    /**
     * Get the vocabulary name.
     *
     * @return Value of vocabulary name.
     */
    public String getVocabulary() {
        return m_vocabulary;
    }

    /**
     * Set the vocabulary name.
     *
     * @param newvocabulary Value to assign to vocabulary name.
     */
    public void setVocabulary(String newvocabulary) {
        m_vocabulary = newvocabulary;
    }

    /**
     * Get the document language.
     *
     * @return Value of document language.
     */
    public String getDocumentLanguage() {
        return m_documentLanguage;
    }

    /**
     * Set the document language.
     *
     * @param newdocumentLanguage Value to assign to document language.
     */
    public void setDocumentLanguage(String newdocumentLanguage) {
        m_documentLanguage = newdocumentLanguage;
    }

    /**
     * Get the vocabulary format.
     *
     * @return Value of vocabulary format.
     */
    public String getVocabularyFormat() {
        return m_vocabularyFormat;
    }

    /**
     * Set the vocabulary format.
     *
     * @param newvocabularyFormat Value to assign to vocabulary format.
     */
    public void setVocabularyFormat(String newvocabularyFormat) {
        m_vocabularyFormat = newvocabularyFormat;
    }

    /**
     * Get the name of the file the model is saved to.
     *
     * @return Value of modelName.
     */
    public String getModelName() {
        return m_modelName;
    }

    /**
     * Set the name of the file the model is saved to.
     *
     * @param newmodelName Value to assign to modelName.
     */
    public void setModelName(String newmodelName) {
        m_modelName = newmodelName;
    }

    /**
     * Get the name of the directory holding the training files.
     *
     * @return Value of dirName.
     */
    public String getDirName() {
        return m_dirName;
    }

    /**
     * Set the name of the directory holding the training files.
     *
     * @param newdirName Value to assign to dirName.
     */
    public void setDirName(String newdirName) {
        m_dirName = newdirName;
    }

    /**
     * Parses a given list of options controlling the behaviour of this object.
     * Valid options are:<p>
     *
     * -l "directory name" <br>
     * Specifies name of directory (required).<p>
     *
     * -m "model name" <br>
     * Specifies name of model (required).<p>
     *
     * -v "vocabulary name" <br>
     * Specifies vocabulary name (required).<p>
     * 
     * -f "vocabulary format" <br>
     * Specifies vocabulary format, "skos" or "text"; required unless the
     * vocabulary name is "none", in which case it is ignored.<p>
     *    
     * -i "document language" <br>
     * Specifies document language (default: en).<p>
     * 
     * -e "encoding" <br>
     * Specifies encoding (default: "default").<p>
     * 
     * -d<br>
     * Turns debugging mode on.<p>
     *
     * -k<br>
     * Use keyphrase frequency statistic.<p>
     *
     * -p<br>
     * Disallow internal periods. <p>
     *
     * -x "length"<br>
     * Sets maximum phrase length (default: 5).<p>
     *
     * -y "length"<br>
     * Sets minimum phrase length (default: 1).<p>
     *
     * -o "number"<br>
     * The minimum number of times a phrase needs to occur (default: 2). <p>
     *
     * -s "name of class implementing list of stop words"<br>
     * Sets list of stop words to use. A simple name is looked up in
     * com.openkm.kea.stopwords; a name containing a dot is used as is.
     * When absent the current stop word list is left unchanged.<p>
     *
     * -t "name of class implementing stemmer"<br>
     * Sets stemmer to use. A simple name is looked up in
     * com.openkm.kea.stemmers; a name containing a dot is used as is.
     * When absent the current stemmer is left unchanged.<p>
     *
     * -n<br>
     * Do not check for proper nouns. <p>
     *
     * @param options the list of options as an array of strings
     * @exception Exception if a required option is missing, a value is
     *            invalid, a class cannot be instantiated, or an option is
     *            not supported
     */
    public void setOptions(String[] options) throws Exception {
        String dirName = Utils.getOption('l', options);
        if (dirName.length() == 0) {
            setDirName(null);
            throw new Exception("Name of directory required argument.");
        }
        setDirName(dirName);

        String modelName = Utils.getOption('m', options);
        if (modelName.length() == 0) {
            setModelName(null);
            throw new Exception("Name of model required argument.");
        }
        setModelName(modelName);

        String vocabularyName = Utils.getOption('v', options);
        if (vocabularyName.length() == 0) {
            setVocabulary(null);
            throw new Exception("Name of vocabulary required argument.");
        }
        setVocabulary(vocabularyName);

        // Always consume -f so that it is not reported as a leftover option,
        // even when the vocabulary is "none" and the format is irrelevant.
        String vocabularyFormat = Utils.getOption('f', options);
        if (vocabularyName.equals("none")) {
            setVocabularyFormat(null);
        } else if (vocabularyFormat.length() == 0) {
            setVocabularyFormat(null);
            throw new Exception(
                    "If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
        } else if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
            setVocabularyFormat(vocabularyFormat);
        } else {
            throw new Exception(
                    "Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
        }

        String encoding = Utils.getOption('e', options);
        setEncoding(encoding.length() > 0 ? encoding : DEFAULT_ENCODING);

        String documentLanguage = Utils.getOption('i', options);
        setDocumentLanguage(documentLanguage.length() > 0 ? documentLanguage : DEFAULT_LANGUAGE);

        setMaxPhraseLength(intOption('x', options, DEFAULT_MAX_PHRASE_LENGTH));
        setMinPhraseLength(intOption('y', options, DEFAULT_MIN_PHRASE_LENGTH));
        setMinNumOccur(intOption('o', options, DEFAULT_MIN_NUM_OCCUR));

        String stopwordsString = Utils.getOption('s', options);
        if (stopwordsString.length() > 0) {
            setStopwords((Stopwords) instantiate(STOPWORDS_PACKAGE, stopwordsString));
        }

        String stemmerString = Utils.getOption('t', options);
        if (stemmerString.length() > 0) {
            setStemmer((Stemmer) instantiate(STEMMERS_PACKAGE, stemmerString));
        }

        setDebug(Utils.getFlag('d', options));
        setUseKFrequency(Utils.getFlag('k', options));
        setDisallowIPeriods(Utils.getFlag('p', options));
        setCheckForProperNouns(!Utils.getFlag('n', options));
        Utils.checkForRemainingOptions(options);
    }

    /**
     * Reads an integer-valued option.
     *
     * @param flag the option character
     * @param options the option array being parsed
     * @param defaultValue value returned when the option is absent
     * @return the parsed option value or the default
     * @exception Exception if the option is malformed or not a number
     */
    private static int intOption(char flag, String[] options, int defaultValue) throws Exception {
        String value = Utils.getOption(flag, options);
        return value.length() > 0 ? Integer.parseInt(value) : defaultValue;
    }

    /**
     * Instantiates a class through its no-argument constructor.
     *
     * @param packagePrefix package (with trailing dot) used for simple names
     * @param className simple or fully-qualified class name
     * @return a new instance of the class
     * @exception Exception if the class cannot be found or instantiated
     */
    private static Object instantiate(String packagePrefix, String className) throws Exception {
        String name = className.trim();
        // getOptions() reports fully-qualified names, so those must be
        // accepted unchanged for its output to be fed back into setOptions().
        String qualifiedName = name.indexOf('.') >= 0 ? name : packagePrefix + name;
        return Class.forName(qualifiedName).newInstance();
    }

    /**
     * Gets the current option settings.
     *
     * @return an array of strings suitable for passing to setOptions;
     *         unused trailing slots hold empty strings
     */
    public String[] getOptions() {
        // 6 valued options + 3 flags + 3 numeric options + -s + -t + -n
        String[] options = new String[26];
        int current = 0;

        options[current++] = "-l";
        options[current++] = "" + (getDirName());
        options[current++] = "-m";
        options[current++] = "" + (getModelName());
        options[current++] = "-v";
        options[current++] = "" + (getVocabulary());
        options[current++] = "-f";
        options[current++] = "" + (getVocabularyFormat());
        options[current++] = "-e";
        options[current++] = "" + (getEncoding());
        options[current++] = "-i";
        options[current++] = "" + (getDocumentLanguage());

        if (getUseKFrequency()) {
            options[current++] = "-k";
        }
        if (getDebug()) {
            options[current++] = "-d";
        }
        if (getDisallowIPeriods()) {
            options[current++] = "-p";
        }
        options[current++] = "-x";
        options[current++] = "" + (getMaxPhraseLength());
        options[current++] = "-y";
        options[current++] = "" + (getMinPhraseLength());
        options[current++] = "-o";
        options[current++] = "" + (getMinNumOccur());

        // The stop word list is null until set explicitly; leave -s out then
        // instead of failing with a NullPointerException.
        Stopwords stopwords = getStopwords();
        if (stopwords != null) {
            options[current++] = "-s";
            options[current++] = stopwords.getClass().getName();
        }
        Stemmer stemmer = getStemmer();
        if (stemmer != null) {
            options[current++] = "-t";
            options[current++] = stemmer.getClass().getName();
        }
        // -n switches the proper noun check OFF (see setOptions).
        if (!getCheckForProperNouns()) {
            options[current++] = "-n";
        }

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options
     */
    public Enumeration<Option> listOptions() {
        Vector<Option> newVector = new Vector<Option>(15);

        newVector.addElement(new Option("\tSpecifies name of directory.", "l", 1, "-l <directory name>"));
        newVector.addElement(new Option("\tSpecifies name of model.", "m", 1, "-m <model name>"));
        newVector.addElement(new Option("\tSpecifies vocabulary name.", "v", 1, "-v <vocabulary name>"));
        newVector.addElement(new Option(
                "\tSpecifies vocabulary format (text or skos; ignored when the vocabulary is none).", "f", 1,
                "-f <vocabulary format>"));
        newVector.addElement(new Option("\tSpecifies document language (en (default), es, de, fr).", "i", 1,
                "-i <document language>"));
        newVector.addElement(new Option("\tSpecifies encoding.", "e", 1, "-e <encoding>"));
        newVector.addElement(new Option("\tTurns debugging mode on.", "d", 0, "-d"));
        newVector.addElement(new Option("\tUse keyphrase frequency statistic.", "k", 0, "-k"));
        newVector.addElement(new Option("\tDisallow internal periods.", "p", 0, "-p"));
        newVector.addElement(new Option("\tSets the maximum phrase length (default: 5).", "x", 1, "-x <length>"));
        newVector.addElement(new Option("\tSets the minimum phrase length (default: 1).", "y", 1, "-y <length>"));
        newVector.addElement(new Option("\tSet the minimum number of occurences (default: 2).", "o", 1,
                "-o <number>"));
        newVector.addElement(new Option("\tSets the list of stopwords to use (default: StopwordsEnglish).", "s", 1,
                "-s <name of stopwords class>"));
        newVector.addElement(new Option("\tSet the stemmer to use (default: SremovalStemmer).", "t", 1,
                "-t <name of stemmer class>"));
        newVector.addElement(new Option("\tDo not check for proper nouns.", "n", 0, "-n"));

        return newVector.elements();
    }

    /**
     * Collects the stems of the file names, i.e. the names of all ".txt" and
     * ".key" files in the configured directory with the extension removed.
     *
     * @return a table keyed by stem; every value is 0
     * @exception Exception if the directory name is unset or the directory
     *            cannot be listed
     */
    public Hashtable<String, Double> collectStems() throws Exception {
        if (m_dirName == null) {
            throw new Exception("Problem opening directory " + m_dirName);
        }

        String[] files;
        try {
            files = new File(m_dirName).list();
        } catch (SecurityException e) {
            throw new Exception("Problem opening directory " + m_dirName, e);
        }
        // File.list() signals "not a directory" or an I/O error with null.
        if (files == null) {
            throw new Exception("Problem opening directory " + m_dirName);
        }

        Hashtable<String, Double> stems = new Hashtable<String, Double>();
        for (String file : files) {
            if (file.endsWith(".key") || file.endsWith(".txt")) {
                // Both extensions are four characters long.
                String stem = file.substring(0, file.length() - 4);
                if (!stems.containsKey(stem)) {
                    stems.put(stem, Double.valueOf(0));
                }
            }
        }
        return stems;
    }

    /**
     * Builds the model from the files. For every stem the document
     * ("stem.txt") and its keyphrases ("stem.key") are read from the
     * configured directory and fed to a freshly configured KEA filter; a file
     * that cannot be read is logged and recorded as a missing value.
     *
     * @param stems the stems of the training files, as returned by
     *            {@link #collectStems()}
     * @param stopwords stop word list handed to the filter constructor; also
     *            used wherever a list is needed when none has been set through
     *            {@link #setStopwords(Stopwords)}
     * @exception Exception if there are no stems or the filter fails
     */
    public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
        // Check whether there is actually any data
        if (stems.isEmpty()) {
            throw new Exception("Couldn't find any data!");
        }

        FastVector atts = new FastVector(2);
        atts.addElement(new Attribute("doc", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        Instances data = new Instances("keyphrase_training_data", atts, 0);

        // An explicitly configured list wins. Otherwise fall back to the
        // argument rather than handing null to the filter, which is what
        // happened whenever only the argument was supplied (as main does).
        Stopwords effectiveStopwords = getStopwords() != null ? getStopwords() : stopwords;

        // Build model
        m_KEAFilter = new KEAFilter(stopwords);

        m_KEAFilter.setDebug(m_debug);
        m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
        m_KEAFilter.setKFused(getUseKFrequency());

        m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
        m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
        m_KEAFilter.setMinNumOccur(getMinNumOccur());
        m_KEAFilter.setStemmer(getStemmer());
        m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
        m_KEAFilter.setVocabulary(getVocabulary());
        m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
        m_KEAFilter.setStopwords(effectiveStopwords);
        m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
        m_KEAFilter.setInputFormat(data);

        if (getVocabulary().equals("none")) {
            m_KEAFilter.m_NODEfeature = false;
        } else {
            m_KEAFilter.loadThesaurus(getStemmer(), effectiveStopwords);
        }
        m_KEAFilter.setNumFeature();

        log.info("-- Reading the Documents... ");

        Enumeration<String> elem = stems.keys();
        while (elem.hasMoreElements()) {
            String str = elem.nextElement();

            double[] newInst = new double[2];
            try {
                String text = readFile(new File(m_dirName + "/" + str + ".txt"));
                newInst[0] = (double) data.attribute(0).addStringValue(text);
            } catch (Exception e) {
                log.error("Can't find document for stem " + str + ".", e);
                newInst[0] = Instance.missingValue();
            }
            try {
                String keyphrases = readFile(new File(m_dirName + "/" + str + ".key"));
                newInst[1] = (double) data.attribute(1).addStringValue(keyphrases);
            } catch (Exception e) {
                log.error("Can't find keyphrases for stem " + str + ".", e);
                newInst[1] = Instance.missingValue();
            }

            // Feed the single instance to the filter, then drop the string
            // values again so that only one document is held at a time.
            data.add(new Instance(1.0, newInst));
            m_KEAFilter.input(data.instance(0));
            data = data.stringFreeStructure();
        }
        m_KEAFilter.batchFinished();

        // Drain the filter's output queue; the instances are not needed.
        while (m_KEAFilter.output() != null) {
            // nothing to do
        }
    }

    /**
     * Reads a whole file into a string using the configured encoding
     * ("default" selects the platform charset). The file is always closed.
     *
     * @param file the file to read
     * @return the content of the file
     * @exception IOException if the file cannot be opened or read, or the
     *            encoding is not supported
     */
    private String readFile(File file) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            InputStreamReader reader = DEFAULT_ENCODING.equals(m_encoding)
                    ? new InputStreamReader(in)
                    : new InputStreamReader(in, m_encoding);
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                content.append(buffer, 0, count);
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle.
            in.close();
        }
    }

    /** 
     * Saves the extraction model to the file named by the model name. The
     * filter's vocabulary is cleared first. Errors while writing are logged,
     * not propagated.
     *
     * @exception Exception if clearing the vocabulary fails, e.g. because
     *            no model has been built yet
     */
    public void saveModel() throws Exception {

        // remove the m_Vocabulary (which is now not static)
        // we don't want it
        m_KEAFilter.clearVocabulary();

        FileOutputStream fileOut = null;
        ObjectOutputStream out = null;
        try {
            fileOut = new FileOutputStream(m_modelName);
            out = new ObjectOutputStream(new BufferedOutputStream(fileOut));
            out.writeObject(m_KEAFilter);
            out.flush();
        } catch (IOException e) {
            log.error("I/O error writing model objects:", e);
        } catch (Throwable e) {
            log.error("Unexpected error writing model objects:", e);
        } finally {
            // Release the file even if serialization failed; closing the
            // outermost stream that was created closes the ones below it.
            try {
                if (out != null) {
                    out.close();
                } else if (fileOut != null) {
                    fileOut.close();
                }
            } catch (IOException e) {
                log.error("I/O error writing model objects:", e);
            }
        }
    }

    /**
     * The main method. Parses the options, builds the model from the files
     * in the given directory and saves it. On any failure the error and the
     * list of available options are logged.
     *
     * @param ops the command line options, see {@link #setOptions(String[])}
     */
    public static void main(String[] ops) {
        KEAModelBuilder kmb = new KEAModelBuilder();

        try {
            kmb.setOptions(ops);

            StringBuilder settings = new StringBuilder("Building model with options:");
            for (String option : kmb.getOptions()) {
                // Skip the empty padding entries at the end of the array.
                if (option.length() > 0) {
                    settings.append(' ').append(option);
                }
            }
            log.info(settings.toString());

            kmb.buildModel(kmb.collectStems(), new StopwordsEnglish());
            kmb.saveModel();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            log.error("\nOptions:\n");
            Enumeration<Option> en = kmb.listOptions();
            while (en.hasMoreElements()) {
                Option option = en.nextElement();
                log.error(option.synopsis());
                log.error(option.description());
            }
        }
    }
}