pt.ua.tm.gimli.writer.JNLPBAWriter.java Source code

Introduction

Here is the source code for pt.ua.tm.gimli.writer.JNLPBAWriter.java
Source

/*
 * Gimli - High-performance and multi-corpus recognition of biomedical
 * entity names
 *
 * Copyright (C) 2011 David Campos, Universidade de Aveiro, Instituto de
 * Engenharia Electrónica e Telemática de Aveiro
 *
 * Gimli is licensed under the Creative Commons
 * Attribution-NonCommercial-ShareAlike 3.0 Unported License. To view a copy of
 * this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * Gimli is a free software, you are free to copy, distribute, change and
 * transmit it. However, you may not use Gimli for commercial purposes.
 *
 * Gimli is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 */
package pt.ua.tm.gimli.writer;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.ua.tm.gimli.annotator.Annotator;
import pt.ua.tm.gimli.config.Constants.EntityType;
import pt.ua.tm.gimli.config.Constants.LabelFormat;
import pt.ua.tm.gimli.config.Constants.LabelTag;
import pt.ua.tm.gimli.config.Constants.Parsing;
import pt.ua.tm.gimli.config.ModelConfig;
import pt.ua.tm.gimli.corpus.Annotation;
import pt.ua.tm.gimli.corpus.AnnotationComparator;
import pt.ua.tm.gimli.corpus.Corpus;
import pt.ua.tm.gimli.corpus.Sentence;
import pt.ua.tm.gimli.corpus.Token;
import pt.ua.tm.gimli.exception.GimliException;
import pt.ua.tm.gimli.model.CRFModel;
import pt.ua.tm.gimli.processing.Abbreviation;
import pt.ua.tm.gimli.processing.Parentheses;

/**
 * Annotate and write the result into a file following the JNLPBA format.
 * <p>
 * Whenever the sentence identifier changes, the identifier is written followed
 * by an empty line. Every token is then written on its own line as
 * {@code text<TAB>label}, and an empty line terminates each sentence. All text
 * is encoded with the platform default charset.
 *
 * @author David Campos
 * (<a href="mailto:david.campos@ua.pt">david.campos@ua.pt</a>)
 * @version 1.0
 * @since 1.0
 */
public class JNLPBAWriter implements ICorpusWriter {

    /**
     * {@link Logger} to be used in the class.
     */
    private static final Logger logger = LoggerFactory.getLogger(JNLPBAWriter.class);
    /**
     * Help message.
     */
    private static final String MODEL_HELP = "Please follow the format: [file],[entity],[parsing],[features]\n"
            + "file: File with model;\n" + "entity: protein, DNA, RNA, cell_line or cell_type;\n"
            + "parsing: fw (forward) or bw (backward);\n" + "features: File with features configuration.";
    /**
     * Line terminator used in the output, encoded once.
     */
    private static final byte[] NEWLINE = "\n".getBytes();
    /**
     * Separator between the token text and its label, encoded once.
     */
    private static final byte[] TAB = "\t".getBytes();

    /**
     * Write annotation result in the JNLPBA format using only one entity type.
     * <p>
     * Tokens whose label is not {@code LabelTag.O} get the corpus entity
     * appended to the label (for instance {@code B-<entity>}).
     *
     * @param corpus The corpus.
     * @param file The file to store the result.
     * @throws GimliException Problem writing into the file.
     */
    @Override
    public void write(final Corpus corpus, final String file) throws GimliException {
        try {
            // Buffered, because every token results in several small writes.
            OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try {
                String lastMedline = "";
                for (int i = 0; i < corpus.size(); i++) {
                    Sentence s = corpus.getSentence(i);
                    String medline = s.getId();

                    // Only emit the identifier when it differs from the previous sentence.
                    if (!medline.equals(lastMedline)) {
                        writeIdentifier(out, medline);
                    }

                    for (int j = 0; j < s.size(); j++) {
                        Token t = s.getToken(j);
                        String text = t.getText();
                        String label;
                        if (!t.getLabel().equals(LabelTag.O)) {
                            label = t.getLabel().toString() + "-" + corpus.getEntity().toString();
                        } else {
                            label = t.getLabel().toString();
                        }
                        writeToken(out, text, label);
                    }
                    out.write(NEWLINE);

                    lastMedline = medline;
                }
                // Flush explicitly so a failure here is reported instead of being
                // hidden inside close().
                out.flush();
            } finally {
                // Always release the file handle, even when writing failed.
                out.close();
            }
        } catch (IOException ex) {
            throw new GimliException("It was not possible to write the annotated corpus in the file.", ex);
        }
    }

    /**
     * Write annotation result in the JNLPBA format, combining annotations of
     * various entity types.
     * <p>
     * Tokens are taken from the first corpus; annotations are collected from the
     * sentence with the same index in every corpus. When fewer than two corpora
     * are provided nothing is written and a warning is logged.
     *
     * @param corpora The annotated corpora.
     * @param file The file to store the result.
     * @throws GimliException Problem writing in the output file.
     */
    public void write(final Corpus[] corpora, final String file) throws GimliException {
        if (corpora.length < 2) {
            logger.warn("Combined JNLPBA output requires at least two corpora, but {} provided. Nothing written.",
                    corpora.length);
            return;
        }
        try {
            // Buffered, because every token results in several small writes.
            OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try {
                String lastMedline = "";
                for (int i = 0; i < corpora[0].size(); i++) {
                    Sentence s = corpora[0].getSentence(i);
                    String medline = s.getId();

                    // Collect all annotations for this sentence and sort them
                    List<Annotation> annotations = collectAnnotations(corpora, i);
                    Collections.sort(annotations, new AnnotationComparator());

                    // Choose the best annotation for the same tokens
                    List<Annotation> unique = selectBestAnnotations(annotations);
                    Collections.sort(unique, new AnnotationComparator());

                    // Write result to output
                    if (!medline.equals(lastMedline)) {
                        writeIdentifier(out, medline);
                    }

                    String[] labels = buildLabels(s, unique);
                    for (int j = 0; j < s.size(); j++) {
                        writeToken(out, s.getToken(j).getText(), labels[j]);
                    }
                    out.write(NEWLINE);

                    lastMedline = medline;
                }
                // Flush explicitly so a failure here is reported instead of being
                // hidden inside close().
                out.flush();
            } finally {
                // Always release the file handle, even when writing failed.
                out.close();
            }
        } catch (IOException ex) {
            throw new GimliException("There was a problem writing in the output file.", ex);
        }
    }

    /**
     * Write a sentence identifier followed by an empty line.
     * @param out Destination stream.
     * @param id Identifier to write.
     * @throws IOException Problem writing into the stream.
     */
    private static void writeIdentifier(final OutputStream out, final String id) throws IOException {
        out.write(id.getBytes());
        out.write(NEWLINE);
        out.write(NEWLINE);
    }

    /**
     * Write one token line: text, tab, label, line break.
     * @param out Destination stream.
     * @param text Token text.
     * @param label Label to write next to the token.
     * @throws IOException Problem writing into the stream.
     */
    private static void writeToken(final OutputStream out, final String text, final String label)
            throws IOException {
        out.write(text.getBytes());
        out.write(TAB);
        out.write(label.getBytes());
        out.write(NEWLINE);
    }

    /**
     * Gather the annotations of the sentence at the given index from every corpus.
     * @param corpora The annotated corpora.
     * @param sentenceIndex Index of the sentence in each corpus.
     * @return The annotations, in corpus order.
     */
    private static List<Annotation> collectAnnotations(final Corpus[] corpora, final int sentenceIndex) {
        List<Annotation> annotations = new ArrayList<Annotation>();
        for (int j = 0; j < corpora.length; j++) {
            Sentence s = corpora[j].getSentence(sentenceIndex);
            for (int k = 0; k < s.getNumberAnnotations(); k++) {
                annotations.add(s.getAnnotation(k));
            }
        }
        return annotations;
    }

    /**
     * For each annotation, find the later annotations whose token range
     * intersects it and keep only the one with the highest score (the earliest
     * one wins ties). Scanning then resumes after the last intersecting
     * annotation.
     * @param sorted Annotations of one sentence, already sorted.
     * @return The selected annotations.
     */
    private static List<Annotation> selectBestAnnotations(final List<Annotation> sorted) {
        List<Annotation> unique = new ArrayList<Annotation>();
        for (int j = 0; j < sorted.size(); j++) {
            Annotation current = sorted.get(j);
            Annotation best = current;
            int lastMatch = j;
            for (int k = j + 1; k < sorted.size(); k++) {
                Annotation other = sorted.get(k);
                boolean intersects = current.getStartIndex() <= other.getEndIndex()
                        && current.getEndIndex() >= other.getStartIndex();
                if (intersects) {
                    if (other.getScore() > best.getScore()) {
                        best = other;
                    }
                    lastMatch = k;
                }
            }
            unique.add(best);

            // Skip everything up to the last annotation that intersected.
            j = lastMatch;
        }
        return unique;
    }

    /**
     * Build the label of every token of a sentence: {@code O} by default,
     * {@code B-<entity>} on the first token of an annotation and
     * {@code I-<entity>} on its remaining tokens. The entity is the one of the
     * corpus that owns the annotation.
     * @param s The sentence whose tokens are labelled.
     * @param annotations The annotations to apply, indexed on the sentence tokens.
     * @return One label per token.
     */
    private static String[] buildLabels(final Sentence s, final List<Annotation> annotations) {
        String[] labels = new String[s.size()];
        for (int j = 0; j < labels.length; j++) {
            labels[j] = LabelTag.O.toString();
        }

        for (int j = 0; j < annotations.size(); j++) {
            Annotation a = annotations.get(j);
            String entity = a.getSentence().getCorpus().getEntity().toString();
            for (int k = a.getStartIndex(); k <= a.getEndIndex(); k++) {
                if (k == a.getStartIndex()) {
                    labels[k] = LabelTag.B.toString() + "-" + entity;
                } else {
                    labels[k] = LabelTag.I.toString() + "-" + entity;
                }
            }
        }
        return labels;
    }

    /**
     * Print help message of the program.
     * @param options Command line arguments.
     * @param msg Message to be displayed; not logged when empty.
     */
    private static void printHelp(Options options, String msg) {
        // Avoid logging an empty error line when help was explicitly requested.
        if (msg != null && msg.length() > 0) {
            logger.error(msg);
        }
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("./gimli.sh annotate JNLPBA", options);
    }

    /**
     * Main program annotate a corpus using one or several models, and save the
     * result in the JNLPBA format.
     * <p>
     * Models given for the same entity are combined to annotate one corpus
     * instance; one corpus instance is created per distinct entity.
     *
     * @param args Command line arguments.
     */
    public static void main(String[] args) {
        // Start stopwatch
        StopWatch watch = new StopWatch();
        watch.start();

        CommandLineParser parser = new GnuParser();
        Options options = new Options();
        options.addOption("h", "help", false, "Print this usage information.");

        options.addOption("c", "corpus", true, "File with the corpus in the CoNNL format.");

        Option modelOption = new Option("m", "model", true, MODEL_HELP);
        modelOption.setArgs(Integer.MAX_VALUE);
        options.addOption(modelOption);

        options.addOption("o", "output", true, "File to save the annotated corpus.");

        CommandLine commandLine;
        try {
            // Parse the program arguments
            commandLine = parser.parse(options, args);
        } catch (ParseException ex) {
            logger.error("There was a problem processing the input arguments.", ex);
            return;
        }

        // Show help text
        if (commandLine.hasOption('h')) {
            printHelp(options, "");
            return;
        }

        // Get corpus
        if (!commandLine.hasOption('c')) {
            printHelp(options, "Please specify the corpus file.");
            return;
        }
        String corpus = commandLine.getOptionValue('c');

        // Get models
        if (!commandLine.hasOption('m')) {
            printHelp(options, "Please specify the models.");
            return;
        }
        String[] input = commandLine.getOptionValues('m');

        // Parse models characteristics. All four arrays have one slot per model
        // specification, so their lengths are equal by construction.
        String[] models = new String[input.length];
        EntityType[] entities = new EntityType[input.length];
        String[] features = new String[input.length];
        Parsing[] parsing = new Parsing[input.length];

        for (int i = 0; i < input.length; i++) {
            String[] strs = input[i].split(",");

            if (strs.length != 4) {
                printHelp(options, "Wrong input format for models.");
                return;
            }

            models[i] = strs[0].trim();
            features[i] = strs[3].trim();
            try {
                entities[i] = EntityType.valueOf(strs[1].trim());
                parsing[i] = Parsing.valueOf(strs[2].trim().toUpperCase());
            } catch (IllegalArgumentException ex) {
                // Unknown entity or parsing name: report it as a usage error
                // instead of terminating with an uncaught exception.
                printHelp(options, "Unknown entity or parsing direction in model specification: " + input[i]);
                return;
            }
        }

        // Get output
        if (!commandLine.hasOption('o')) {
            printHelp(options, "Please specify the output file.");
            return;
        }
        String output = commandLine.getOptionValue('o');

        // Give feedback
        logger.info("Corpus: {}", corpus);
        logger.info("Models:");
        for (int i = 0; i < models.length; i++) {
            logger.info("\t{}: {}, {}, {}, {}",
                    new Object[] { i + 1, models[i], entities[i], features[i], parsing[i] });
        }
        logger.info("Output: {}", output);

        // Load model configurations
        ModelConfig[] mc = new ModelConfig[features.length];
        for (int i = 0; i < features.length; i++) {
            mc[i] = new ModelConfig(features[i]);
        }

        // Get unique entities, keeping the order in which they were given
        List<EntityType> uniqueEntities = new ArrayList<EntityType>();
        for (int i = 0; i < entities.length; i++) {
            if (!uniqueEntities.contains(entities[i])) {
                uniqueEntities.add(entities[i]);
            }
        }

        // Load Corpus (one instance per entity)
        Corpus[] c = new Corpus[uniqueEntities.size()];
        try {
            for (int i = 0; i < uniqueEntities.size(); i++) {
                c[i] = new Corpus(LabelFormat.BIO, uniqueEntities.get(i), corpus);
            }
        } catch (GimliException ex) {
            logger.error("There was a problem loading the corpus.", ex);
            return;
        }

        // Combine models of the same entity
        for (int j = 0; j < uniqueEntities.size(); j++) {
            List<CRFModel> crfmodels = new ArrayList<CRFModel>();

            try {
                for (int i = 0; i < entities.length; i++) {
                    if (entities[i].equals(uniqueEntities.get(j))) {
                        crfmodels.add(new CRFModel(mc[i], parsing[i], models[i]));
                    }
                }
            } catch (GimliException ex) {
                logger.error("There was a problem loading a model.", ex);
                return;
            }

            Annotator a = new Annotator(c[j]);
            if (crfmodels.size() > 1) {
                logger.info("Annotating the corpus by combining {} models for {}...", crfmodels.size(),
                        uniqueEntities.get(j));
                a.annotate(crfmodels.toArray(new CRFModel[0]));
            } else {
                logger.info("Annotating the corpus using 1 model for {}...", uniqueEntities.get(j));
                a.annotate(crfmodels.get(0));
            }
        }

        // Post-processing
        for (int i = 0; i < c.length; i++) {
            Parentheses.processRemoving(c[i]);
            Abbreviation.process(c[i]);
        }

        // Write annotations to file
        logger.info("Writing the corpus in the JNLPBA format...");
        JNLPBAWriter writer = new JNLPBAWriter();

        try {
            if (c.length > 1) {
                writer.write(c, output);
            } else {
                writer.write(c[0], output);
            }
        } catch (GimliException ex) {
            logger.error("There was a problem writing the corpus to file.", ex);
            return;
        }

        // Time feedback
        watch.stop();
        logger.info("Done!");
        logger.info("Total time: {}", watch.toString());

        double time = (double) watch.getTime();
        double size = (double) c[0].size();

        // Without sentences the averages below would be divisions by zero.
        if (size > 0) {
            // The estimates treat 10 sentences as one abstract, hence 100
            // sentences as 10 abstracts.
            double perSentence = time / size;
            double perAbstract = time / (size / 10.0);
            double perTenAbstracts = time / (size / 100.0);

            // StopWatch reports milliseconds; divide by 1000 to log seconds.
            logger.info("Per sentence: {} seconds", String.format("%2.4f", perSentence / 1000.0));
            logger.info("Per abstract: {} seconds", String.format("%2.4f", perAbstract / 1000.0));
            logger.info("10 abstracts: {} seconds", String.format("%2.4f", perTenAbstracts / 1000.0));
        }
    }
}