pt.ua.tm.gimli.writer.BCWriter.java Source code

Java tutorial

Introduction

Here is the source code for pt.ua.tm.gimli.writer.BCWriter.java

Source

/*
 * Gimli - High-performance and multi-corpus recognition of biomedical
 * entity names
 *
 * Copyright (C) 2011 David Campos, Universidade de Aveiro, Instituto de
 * Engenharia Electrnica e Telemtica de Aveiro
 *
 * Gimli is licensed under the Creative Commons
 * Attribution-NonCommercial-ShareAlike 3.0 Unported License. To view a copy of
 * this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * Gimli is a free software, you are free to copy, distribute, change and
 * transmit it. However, you may not use Gimli for commercial purposes.
 *
 * Gimli is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 */
package pt.ua.tm.gimli.writer;

import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.ua.tm.gimli.annotator.Annotator;
import pt.ua.tm.gimli.corpus.Corpus;
import pt.ua.tm.gimli.config.Constants.Parsing;
import pt.ua.tm.gimli.config.Constants.EntityType;
import pt.ua.tm.gimli.config.Constants.LabelFormat;
import pt.ua.tm.gimli.exception.GimliException;
import pt.ua.tm.gimli.model.CRFModel;
import pt.ua.tm.gimli.config.ModelConfig;
import pt.ua.tm.gimli.corpus.Annotation;
import pt.ua.tm.gimli.corpus.Sentence;
import pt.ua.tm.gimli.processing.Abbreviation;
import pt.ua.tm.gimli.processing.Parentheses;

/**
 * Annotate and write the result into a file following the BioCreative format.
 *
 * @author David Campos
 * (<a href="mailto:david.campos@ua.pt">david.campos@ua.pt</a>)
 * @version 1.0
 * @since 1.0
 */
public class BCWriter implements ICorpusWriter {

    /**
     * {@link Logger} to be used in the class.
     */
    private static Logger logger = LoggerFactory.getLogger(BCWriter.class);
    /**
     * Help message.
     */
    private static final String MODEL_HELP = "Please follow the format: [file],[parsing],[features]\n"
            + "file: File with model;\n" + "parsing: fw (forward) or bw (backward);\n"
            + "features: File with features configuration.";

    /**
     * Print help message of the program.
     * @param options Command line arguments.
     * @param msg Message to be displayed.
     */
    private static void printHelp(Options options, String msg) {
        logger.error(msg);
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("./gimli.sh annotate BC2", options);
    }

    /**
     * Convert the {@link Sentence} to the BioCreative format.
     * @param s The sentence to be converted.
     * @return The string that represents the sentence in the BioCreative
     * format.
     */
    private String sentenceToBCFormat(final Sentence s) {
        StringBuilder sb = new StringBuilder();

        Annotation a;
        for (int i = 0; i < s.getNumberAnnotations(); i++) {
            a = s.getAnnotation(i);

            String text = a.getText();
            int startChar = s.getToken(a.getStartIndex()).getStart();
            int endChar = s.getToken(a.getEndIndex()).getEnd();

            // Add annotation
            sb.append(s.getId());
            sb.append("|");
            sb.append(startChar);
            sb.append(" ");
            sb.append(endChar);
            sb.append("|");
            sb.append(text);
            sb.append("\n");
        }

        return sb.toString();
    }

    /**
     * Write annotation result in the BioCreative format.
     * @param corpus The corpus.
     * @param file The file to store the result.
     * @throws GimliException Problem writing into the file.
     */
    @Override
    public void write(final Corpus corpus, final String file) throws GimliException {
        try {
            FileOutputStream out = new FileOutputStream(file);
            for (int i = 0; i < corpus.size(); i++) {
                out.write(sentenceToBCFormat(corpus.getSentence(i)).getBytes());
            }
            out.close();
        } catch (IOException ex) {
            throw new GimliException("It was not possible to write the annotated corpus in the file.", ex);
        }
    }

    /**
     * Main program annotate a corpus using one or several models, and save the
     * result in the BioCreative format.
     * @param args Command line arguments.
     */
    public static void main(String[] args) {
        // Start stopwatch
        StopWatch watch = new StopWatch();
        watch.start();

        CommandLineParser parser = new GnuParser();
        Options options = new Options();
        options.addOption("h", "help", false, "Print this usage information.");

        options.addOption("c", "corpus", true, "File with the corpus in the CoNNL format.");

        Option o = new Option("m", "model", true, MODEL_HELP);
        o.setArgs(Integer.MAX_VALUE);

        options.addOption(o);

        options.addOption("o", "output", true, "File to save the annotated corpus.");
        CommandLine commandLine = null;
        try {
            // Parse the program arguments
            commandLine = parser.parse(options, args);
        } catch (ParseException ex) {
            logger.error("There was a problem processing the input arguments.", ex);
            return;
        }

        // Show help text
        if (commandLine.hasOption('h')) {
            printHelp(options, "");
            return;
        }

        // Get corpus
        String corpus = null;
        if (commandLine.hasOption('c')) {
            corpus = commandLine.getOptionValue('c');
        } else {
            printHelp(options, "Please specify the corpus file.");
            return;
        }

        // Get models
        String[] input = null;
        if (commandLine.hasOption('m')) {
            input = commandLine.getOptionValues('m');
        } else {
            printHelp(options, "Please specify the models.");
            return;
        }

        // Parse models characteristics
        String[] models = new String[input.length];
        String[] features = new String[input.length];
        Parsing[] parsing = new Parsing[input.length];

        String[] strs;
        for (int i = 0; i < input.length; i++) {
            strs = input[i].split(",");

            if (strs.length != 3) {
                printHelp(options, "Wrong input format for models.");
                return;
            }

            models[i] = strs[0].trim();
            parsing[i] = Parsing.valueOf(strs[1].trim().toUpperCase());
            features[i] = strs[2].trim();
        }

        // Get output
        String output = null;
        if (commandLine.hasOption('o')) {
            output = commandLine.getOptionValue('o');
        } else {
            printHelp(options, "Please specify the output file.");
            return;
        }

        // Check length consistency
        if ((features.length != models.length) || (features.length != parsing.length)) {
            logger.error(
                    "The number of feature files, parsing, entities and models are different from each other.");
            return;
        }

        // Give user feedback
        logger.info("Corpus: {}", corpus);
        logger.info("Models:");
        for (int i = 0; i < models.length; i++) {
            logger.info("\t{}: {}, {}, {}", new Object[] { i + 1, models[i], features[i], parsing[i] });
        }
        logger.info("Output: {}", output);

        // Load model configurations
        ModelConfig[] mc = new ModelConfig[features.length];
        for (int i = 0; i < features.length; i++) {
            mc[i] = new ModelConfig(features[i]);
        }

        // Load Corpus
        Corpus c = null;
        try {
            c = new Corpus(LabelFormat.BIO, EntityType.protein, corpus);
        } catch (GimliException ex) {
            logger.error("There was a problem loading the corpus", ex);
            return;
        }

        // Load models
        CRFModel[] crfmodels = new CRFModel[models.length];
        try {
            for (int i = 0; i < models.length; i++) {
                crfmodels[i] = new CRFModel(mc[i], parsing[i], models[i]);
            }
        } catch (GimliException ex) {
            logger.error("There was a problem loading the model(s)", ex);
            return;
        }

        Annotator a = new Annotator(c);
        if (crfmodels.length > 1) {
            // Annotate combining the models
            logger.info("Annotating the corpus by combining {} models... ", crfmodels.length);
            a.annotate(crfmodels);

        } else {
            // Annotate using only one model
            logger.info("Annotating the corpus using 1 model...");
            a.annotate(crfmodels[0]);
        }

        // Post-process annotations
        Parentheses.processRemoving(c);
        Abbreviation.process(c);

        // Write to file in the BC format
        BCWriter writer = new BCWriter();
        try {
            logger.info("Wrtiting annotated corpus into file: {}", output);
            writer.write(c, output);
        } catch (GimliException ex) {
            logger.error("There was a problem writing the corpus to file", ex);
        }

        // Time feedback
        watch.stop();
        logger.info("Done!");
        logger.info("Total time: {}", watch.toString());

        double time = (double) watch.getTime();
        double size = (double) c.size();

        double perSentence = time / size;
        double perAbstract = time / (size / 10.0);
        double perHundred = time / (size / 100.0);

        logger.info("Per sentence: {} seconds", String.format("%2.4f", perSentence / 1000.0));
        logger.info("Per abstract: {} seconds", String.format("%2.4f", perAbstract / 1000.0));
        logger.info("10 abstracts: {} seconds", String.format("%2.4f", perHundred / 1000.0));
    }
}