gate.creole.kea.Kea.java Source code

Introduction

Here is the source code for gate.creole.kea.Kea.java
Source

/*
 *    Kea.java
 * 
 *    Copyright (c) 1998-2005, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 3/Feb/2005
 *
 *  $Id: Kea.java 6558 2005-02-03 17:28:52 +0000 (Thu, 03 Feb 2005) valyt $
 */

package gate.creole.kea;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.gui.ActionsPublisher;
import gate.gui.MainFrame;
import gate.util.Err;
import gate.util.InvalidOffsetException;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import javax.swing.Action;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;

import kea.KEAFilter;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

/**
 * This is wrapper for using the KEA Keyphrase extractor
 * (<a href="http://www.nzdl.org/Kea/">http://www.nzdl.org/Kea/</a>)
 * within the GATE Language Engineering
 * architecture (<a href="http://gate.ac.uk">http://gate.ac.uk</a>).
 * It exposes KEA as a GATE Processing Resource that has two functioning modes:
 * <UL>
 *  <LI>Training mode: when keyphrases (marked as annotations on documents) are
 *  collected and a model is built.
 *  <LI>Application mode: when a built model is applied on documents to the end
 *  of extracting keyphrases.
 * </UL>
 */
@SuppressWarnings("serial")
public class Kea extends AbstractLanguageAnalyser implements ActionsPublisher {

    private static final long serialVersionUID = 8297240873486868105L;

    /**
     * Anonymous constructor, required by GATE. Does nothing.
     */
    public Kea() {
    }

    /**
     * Gets the list of GUI actions available from this PR. Currently Load and
     * Save model.
     * @return
     */
    public List<Action> getActions() {
        return actions;
    }

    /**
     * Executes this PR. Depeding on the state of the {@link #trainingMode} switch
     * it will either train a model or apply it over the documents.<br>
     * Trainig consists of collecting keyphrase annotations from the input
     * annotation set of the input documents. The first time a trained model is
     * required (either application mode has started or the model is being saved)
     * the actual model ({link @ #keaModel}) will be constructed.<br>
     * The application mode consists of using a trained model to generate
     * keyphrase annotations on the output annotation set of the input documents.
     * @throws ExecutionException
     */
    public void execute() throws gate.creole.ExecutionException {
        //reinitialise the KEA filter if already trained
        if (trainingMode.booleanValue() && trainingFinished) {
            //retrainig started with a used model
            System.out.println("Reinitialising KEA model...");
            try {
                initModel();
            } catch (Exception e) {
                throw new ExecutionException(e);
            }
        }

        //get the clear text from the document.
        String text = document.getContent().toString();
        //generate the first attribute: the text
        //this will be used for both training and application modes.
        double[] newInst = new double[2];
        newInst[0] = data.attribute(0).addStringValue(text);

        if (trainingMode.booleanValue()) {
            //training mode -> we need to collect the keyphrases
            //find the input annotation set.
            AnnotationSet annSet = inputAS == null || inputAS.length() == 0 ? document.getAnnotations()
                    : document.getAnnotations(inputAS);
            //extract the keyphrase annotations
            AnnotationSet kpSet = annSet.get(keyphraseAnnotationType);
            if (kpSet != null && kpSet.size() > 0) {
                //use a set to avoid repetitions
                Set<String> keyPhrases = new HashSet<String>();

                Iterator<Annotation> keyPhraseIter = kpSet.iterator();
                //initialise the string for the second attribute
                String keyPhrasesStr = "";
                while (keyPhraseIter.hasNext()) {
                    //get one keyphrase annotation
                    Annotation aKeyPhrase = keyPhraseIter.next();
                    try {
                        //get the string for the keyphrase annotation
                        String keyPhraseStr = document.getContent()
                                .getContent(aKeyPhrase.getStartNode().getOffset(),
                                        aKeyPhrase.getEndNode().getOffset())
                                .toString();
                        //if the keyphrase has not been seen before add to the string for
                        //the second attribute
                        if (keyPhrases.add(keyPhraseStr))
                            keyPhrasesStr += keyPhrasesStr.length() > 0 ? "\n" + keyPhraseStr : keyPhraseStr;
                    } catch (InvalidOffsetException ioe) {
                        throw new ExecutionException(ioe);
                    }
                }
                //all the keyphrases have been enumerated -> create the second attribute
                newInst[1] = data.attribute(1).addStringValue(keyPhrasesStr);
            } else {
                //no keyphrase annotations
                newInst[1] = Instance.missingValue();
                System.out.println("No keyphrases in document: " + document.getName());
            }
            //add the new instance to the dataset
            data.add(new Instance(1.0, newInst));
            try {
                keaFilter.input(data.instance(0));
            } catch (Exception e) {
                throw new ExecutionException(e);
            }
            data = data.stringFreeStructure();
        } else {
            //application mode -> we need to generate keyphrases
            //build the model if not already done
            if (!trainingFinished)
                finishTraining();

            newInst[1] = Instance.missingValue();
            data.add(new Instance(1.0, newInst));
            try {
                keaFilter.input(data.instance(0));
            } catch (Exception e) {
                throw new ExecutionException(e);
            }
            data = data.stringFreeStructure();
            //extract the output from the model
            Instance[] topRankedInstances = new Instance[phrasesToExtract.intValue()];
            Instance inst;
            while ((inst = keaFilter.output()) != null) {
                int index = (int) inst.value(keaFilter.getRankIndex()) - 1;
                if (index < phrasesToExtract.intValue()) {
                    topRankedInstances[index] = inst;
                }
            }
            //annotate the document with the results -> create a list with all the
            //keyphrases found by KEA
            List<String> phrases = new ArrayList<String>();
            for (int i = 0; i < topRankedInstances.length; i++) {
                if (topRankedInstances[i] != null) {
                    phrases.add(topRankedInstances[i].stringValue(keaFilter.getUnstemmedPhraseIndex()));
                }
            }
            try {
                //add the actiul annotations on the document
                annotateKeyPhrases(phrases);
            } catch (Exception e) {
                throw new ExecutionException(e);
            }
        } //application mode
    }//execute

    /**
     * Annnotates the document with all the occurences of keyphrases from a List.
     * Uses the java.util.regex package to search for ocurences of keyphrases.
     * @param phrases the list of keyphrases.
     * @throws Exception
     */
    protected void annotateKeyPhrases(List<String> phrases) throws Exception {
        if (phrases == null || phrases.isEmpty())
            return;
        //create a pattern
        String patternStr = "";
        Iterator<String> phraseIter = phrases.iterator();
        while (phraseIter.hasNext()) {
            String phrase = phraseIter.next();
            patternStr += patternStr.length() == 0 ? "\\Q" + phrase + "\\E" : "|\\Q" + phrase + "\\E";
        }

        Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

        Matcher matcher = pattern.matcher(document.getContent().toString());
        //find the output annotation set
        AnnotationSet outputSet = outputAS == null || outputAS.length() == 0 ? document.getAnnotations()
                : document.getAnnotations(outputAS);
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            //add the new annotation
            outputSet.add(new Long(start), new Long(end), keyphraseAnnotationType, Factory.newFeatureMap());
        }
        document.getFeatures().put("KEA matched keyphrases", phrases);
    }//protected void annotateKeyPhrases(List phrases)

    /**
     * Action used to save a trained model. If the model is not built yet it will
     * be built before being saved.
     */
    protected class SaveModelAction extends javax.swing.AbstractAction {
        public SaveModelAction() {
            super("Save model");
            putValue(SHORT_DESCRIPTION, "Saves the KEA model to a file");
        }

        public void actionPerformed(java.awt.event.ActionEvent evt) {
            //we need to use a new thread to avoid blocking the GUI
            Runnable runnable = new Runnable() {
                public void run() {
                    //get the file to save to
                    JFileChooser fileChooser = MainFrame.getFileChooser();
                    fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
                    fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
                    fileChooser.setMultiSelectionEnabled(false);
                    if (fileChooser.showSaveDialog(null) == JFileChooser.APPROVE_OPTION) {
                        File file = fileChooser.getSelectedFile();
                        ObjectOutputStream oos = null;
                        try {
                            MainFrame.lockGUI("Saving KEA model...");
                            if (!trainingFinished)
                                finishTraining();
                            //zip and save the model
                            oos = new ObjectOutputStream(
                                    new GZIPOutputStream(new FileOutputStream(file.getCanonicalPath(), false)));
                            oos.writeObject(keaFilter);
                            oos.flush();
                            oos.close();
                            oos = null;
                        } catch (Exception e) {
                            MainFrame.unlockGUI();
                            JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + e.toString(),
                                    "Gate", JOptionPane.ERROR_MESSAGE);
                            e.printStackTrace(Err.getPrintWriter());
                            if (oos != null)
                                try {
                                    oos.close();
                                } catch (IOException ioe) {
                                    JOptionPane.showMessageDialog(MainFrame.getInstance(),
                                            "Error!\n" + ioe.toString(), "Gate", JOptionPane.ERROR_MESSAGE);
                                    ioe.printStackTrace(Err.getPrintWriter());
                                }
                        } finally {
                            MainFrame.unlockGUI();
                        }
                    }
                }
            };
            Thread thread = new Thread(runnable, "ModelSaver(serialisation)");
            thread.setPriority(Thread.MIN_PRIORITY);
            thread.start();
        }
    }

    /**
     * Action for loading a saved model. Once loaded the model wil be marked as
     * trained.
     */
    protected class LoadModelAction extends javax.swing.AbstractAction {
        public LoadModelAction() {
            super("Load model");
            putValue(SHORT_DESCRIPTION, "Loads a KEA model from a file");
        }

        public void actionPerformed(java.awt.event.ActionEvent evt) {
            Runnable runnable = new Runnable() {
                public void run() {
                    //get the file to load from.
                    JFileChooser fileChooser = MainFrame.getFileChooser();
                    fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
                    fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
                    fileChooser.setMultiSelectionEnabled(false);
                    if (fileChooser.showOpenDialog(null) == JFileChooser.APPROVE_OPTION) {
                        File file = fileChooser.getSelectedFile();
                        ObjectInputStream ois = null;
                        try {
                            MainFrame.lockGUI("Loading model...");
                            //unzip and load the model
                            ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(file)));
                            keaFilter = (KEAFilter) ois.readObject();
                            ois.close();
                            ois = null;
                            //mark the model as trained.
                            trainingFinished = true;
                        } catch (Exception e) {
                            MainFrame.unlockGUI();
                            JOptionPane.showMessageDialog(MainFrame.getInstance(), "Error!\n" + e.toString(),
                                    "Gate", JOptionPane.ERROR_MESSAGE);
                            e.printStackTrace(Err.getPrintWriter());
                            if (ois != null)
                                try {
                                    ois.close();
                                } catch (IOException ioe) {
                                    JOptionPane.showMessageDialog(MainFrame.getInstance(),
                                            "Error!\n" + ioe.toString(), "Gate", JOptionPane.ERROR_MESSAGE);
                                    ioe.printStackTrace(Err.getPrintWriter());
                                }
                        } finally {
                            MainFrame.unlockGUI();
                        }
                    }
                }
            };
            Thread thread = new Thread(runnable, "ModelLoader(serialisation)");
            thread.setPriority(Thread.MIN_PRIORITY);
            thread.start();
        }
    }

    /**
     * Sets the annotation type to be used for keyphrases.
     * @param keyphraseAnnotationType
     */
    public void setKeyphraseAnnotationType(String keyphraseAnnotationType) {
        this.keyphraseAnnotationType = keyphraseAnnotationType;
    }

    /**
     * Sets the annotation type to be used for keyphrases.
     * @return
     */
    public String getKeyphraseAnnotationType() {
        return keyphraseAnnotationType;
    }

    /**
     * Sets the name for the input annotation set.
     * @param inputAS
     */
    public void setInputAS(String inputAS) {
        this.inputAS = inputAS;
    }

    /**
     * Gets the name for the input annotation set.
     * @return
     */
    public String getInputAS() {
        return inputAS;
    }

    /**
     * Sets the name for the output annotation set.
     * @param outputAS
     */
    public void setOutputAS(String outputAS) {
        this.outputAS = outputAS;
    }

    /**
     * Gets the name for the output annotation set.
     * @return
     */
    public String getOutputAS() {
        return outputAS;
    }

    /**
     * Initialises this KEA Processing Resource.
     * @return
     * @throws ResourceInstantiationException
     */
    public Resource init() throws gate.creole.ResourceInstantiationException {
        System.out.println("\nThis is KEA (automatic keyphrase extraction)");
        System.out.println("Details at http://www.nzdl.org/Kea/\n");
        super.init();
        try {
            initModel();
        } catch (Exception e) {
            throw new ResourceInstantiationException(e);
        }
        actions = new ArrayList<Action>();
        actions.add(new SaveModelAction());
        actions.add(new LoadModelAction());
        return this;
    }

    /**
     * Initialises the KEA model.
     * @throws Exception
     */
    protected void initModel() throws Exception {
        keaFilter = new KEAFilter();

        atts = new FastVector(2);
        atts.addElement(new Attribute("doc", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        data = new Instances("keyphrase_training_data", atts, 0);

        keaFilter.setDisallowInternalPeriods(getDisallowInternalPeriods().booleanValue());
        keaFilter.setKFused(getUseKFrequency().booleanValue());
        keaFilter.setMaxPhraseLength(getMaxPhraseLength().intValue());
        keaFilter.setMinPhraseLength(getMinPhraseLength().intValue());
        keaFilter.setMinNumOccur(getMinNumOccur().intValue());
        keaFilter.setInputFormat(data);
        trainingFinished = false;
    }

    /**
     * Stops the training phase and builds the actual model.
     * @throws ExecutionException
     */
    protected void finishTraining() throws ExecutionException {
        if (trainingFinished)
            return;
        try {
            keaFilter.batchFinished();
        } catch (Exception e) {
            throw new ExecutionException(e);
        }
        // Get rid of instances in filter
        @SuppressWarnings("unused")
        Instance dummy;
        while ((dummy = keaFilter.output()) != null) {
        }
        ;
        trainingFinished = true;
    }

    public void setMaxPhraseLength(Integer maxPhraseLength) {
        this.maxPhraseLength = maxPhraseLength;
    }

    public Integer getMaxPhraseLength() {
        return maxPhraseLength;
    }

    public void setMinPhraseLength(Integer minPhraseLength) {
        this.minPhraseLength = minPhraseLength;
    }

    public Integer getMinPhraseLength() {
        return minPhraseLength;
    }

    public void setDisallowInternalPeriods(Boolean dissallowInternalPeriods) {
        this.disallowInternalPeriods = dissallowInternalPeriods;
    }

    public Boolean getDisallowInternalPeriods() {
        return disallowInternalPeriods;
    }

    public void setUseKFrequency(Boolean useKFrequency) {
        this.useKFrequency = useKFrequency;
    }

    public Boolean getUseKFrequency() {
        return useKFrequency;
    }

    public void setMinNumOccur(Integer minNumOccur) {
        this.minNumOccur = minNumOccur;
    }

    public Integer getMinNumOccur() {
        return minNumOccur;
    }

    public Boolean getTrainingMode() {
        return trainingMode;
    }

    public void setTrainingMode(Boolean trainingMode) {
        this.trainingMode = trainingMode;
    }

    public void setPhrasesToExtract(Integer phrasesToExtract) {
        this.phrasesToExtract = phrasesToExtract;
    }

    public Integer getPhrasesToExtract() {
        return phrasesToExtract;
    }

    /**
     * If <tt>true</tt> then the PR is in training mode and will collect
     * keyphrases from the input documents.
     * If <tt>false</tt> then the PR is in application mode and will generate
     * keyphrases on the input documents.
     */
    private Boolean trainingMode;

    /**
     * The annotation type used for the keyphrase annotations.
     */
    private String keyphraseAnnotationType;

    /**
     * The name of the input annotation set.
     */
    private String inputAS;

    /**
     * The name for the output annotation set.
     */
    private String outputAS;

    /**
     * The maximum length for a keyphrase (default 3).
     */
    private Integer maxPhraseLength;

    /**
     * The minimum length for a keyphrase (default 1).
     */
    private Integer minPhraseLength;

    /**
     * Should periods be disallowed inside keyphrases?
     */
    private Boolean disallowInternalPeriods;

    /**
     * Use keyphrase frequency statistic.
     */
    private Boolean useKFrequency;

    /**
     * The minimum number of times a phrase needs to occur (default: 2).
     */
    private Integer minNumOccur;

    /**
     * How many keyphrases should be extracted for each input document?
     */
    private Integer phrasesToExtract;

    /**
     * This flag is used to determine whether the model has been constructed or
     * not. During training mode the training data is simply collected and this
     * flag is set to <tt>false</tt>. The first time when the traied model is
     * required (which could be either the first time the application mode is
     * started or when the model is being saved) the model is built from the
     * collected instances and this flag is set to <tt>true</tt>.<br>
     * If this flag is found to be <tt>true</tt> during training phase (i.e.
     * there is an attempt to train an already triend model) then the current
     * model will be discarded and a new one will be created. The traininig will
     * be performed using the newly created model.
     */
    protected boolean trainingFinished;

    /**
     * The KEA filter object which incorporates the actual model.
     */
    protected KEAFilter keaFilter = null;

    /**
     * Data structure used internally to define the dataset.
     */
    protected FastVector atts;

    /**
     * The dataset.
     */
    protected Instances data;

    /**
     * The list of GUI actions available from this PR on popup menus.
     */
    protected List<Action> actions;
}