pt.ua.tm.neji.train.batch.TrainBatchExecutor.java Source code

Java tutorial

Introduction

Here is the source code for pt.ua.tm.neji.train.batch.TrainBatchExecutor.java

Source

/*
 * Copyright (c) 2016 BMD Software and University of Aveiro.
 *
 * Neji is a flexible and powerful platform for biomedical information extraction from text.
 *
 * This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
 * To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * This project is a free software, you are free to copy, distribute, change and transmit it.
 * However, you may not use it for commercial purposes.
 *
 * It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package pt.ua.tm.neji.train.batch;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.Validate;
import org.apache.commons.lang.reflect.ConstructorUtils;
import org.apache.commons.lang.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.ua.tm.neji.batch.FileBatchExecutor;
import pt.ua.tm.neji.context.Context;
import pt.ua.tm.neji.context.InputFormat;
import pt.ua.tm.neji.core.Constants.LabelFormat;
import pt.ua.tm.neji.core.batch.BatchExecutor;
import pt.ua.tm.neji.core.corpus.Corpus;
import pt.ua.tm.neji.core.processor.Processor;
import pt.ua.tm.neji.exception.NejiException;
import pt.ua.tm.neji.processor.filewrappers.InputFile;
import pt.ua.tm.neji.train.context.TrainContext;
import pt.ua.tm.neji.train.util.A1Utils;
import pt.ua.tm.neji.train.util.A1Utils.A1Pairs;

/**
 * Batch executor used by the training pipeline.
 *
 * <p>In phase 1 (as reported by {@link TrainContext#getPhase()}) the executor builds one
 * {@link Processor} per input and runs it: a sentences file plus an annotations file for
 * {@link InputFormat#BC2}, every file of a folder (on a thread pool) for {@link InputFormat#A1},
 * and a single sentences file for every other format. In any other phase a single processor
 * is run over a placeholder input file.</p>
 *
 * <p>Processors are instantiated reflectively: the processor class must expose a public
 * constructor whose parameters match the runtime types of the context, the input wrapper(s)
 * and the extra arguments, in that order.</p>
 *
 * @author jeronimo
 */
public class TrainBatchExecutor extends BatchExecutor {

    // Attributes
    private static final Logger logger = LoggerFactory.getLogger(TrainBatchExecutor.class);
    private static final String PROCESSOR_CREATION_ERROR = "There was a problem creating the processor";
    private static final String PIPELINE_CREATION_ERROR = "Error creating new pipeline processor.";
    private final String inputSentencesFilePath;
    private final String inputAnnotationsFilePath;
    private final int numThreads;

    /**
     * Constructor. Phase 1
     * @param inputSentencesFilePath input sentences file path (a folder path for the A1 format)
     * @param inputAnnotationsFilePath input annotations file path (only read for the BC2 format)
     * @param numThreads number of threads to use (only read for the A1 format)
     */
    public TrainBatchExecutor(final String inputSentencesFilePath, final String inputAnnotationsFilePath,
            int numThreads) {
        this.inputSentencesFilePath = inputSentencesFilePath;
        this.inputAnnotationsFilePath = inputAnnotationsFilePath;
        this.numThreads = numThreads;
    }

    /**
     * Constructor. Phase 2
     *
     * <p>No input paths are set and a single thread is configured.</p>
     */
    public TrainBatchExecutor() {
        this(null, null, 1);
    }

    /**
     * Initializes the context, dispatches to the processing routine that matches the training
     * phase and the configured input format, terminates the context and logs the elapsed time.
     *
     * @param processorCls processor class to instantiate for each input
     * @param context execution context; it is cast to {@link TrainContext}, so any other
     *                implementation fails with a {@link ClassCastException}
     * @param args extra constructor arguments forwarded to the processor
     * @throws NejiException declared by the overridden method; presumably raised by the context
     *                       initialization or termination (not visible from this class)
     */
    @Override
    public void run(Class<? extends Processor> processorCls, Context context, Object... args) throws NejiException {

        StopWatch timer = new StopWatch();

        logger.info("Initializing context...");
        context.initialize();

        timer.start();

        // Cast once instead of at every call site; the failure point (after initialize) is unchanged
        TrainContext trainContext = (TrainContext) context;

        if (trainContext.getPhase() == 1) { // Phase 1

            if (context.getConfiguration().getInputFormat().equals(InputFormat.BC2)) {
                // File + annotations format
                processFiles(inputSentencesFilePath, inputAnnotationsFilePath, trainContext, processorCls, args);
            } else if (context.getConfiguration().getInputFormat().equals(InputFormat.A1)) {
                // Folder format
                processMultipleFiles(inputSentencesFilePath, numThreads, trainContext, processorCls, args);
            } else {
                // Single file formats
                processFiles(inputSentencesFilePath, trainContext, processorCls, args);
            }
        } else { // Phase 2

            // No input path is forwarded here: the processor receives a placeholder file
            processFiles2(trainContext, processorCls, args);
        }

        logger.info("Terminating context...");
        context.terminate();

        timer.stop();
        logger.info("Processed files in {}", timer.toString());
    }

    /**
     * This executor does not collect the corpora it processes.
     *
     * @return always {@code null}
     */
    @Override
    public Collection<Corpus> getProcessedCorpora() {
        return null;
    }

    /**
     * Phase 1, sentences + annotations: runs one processor over a pair of files that share
     * the same corpus. The processor runs synchronously on the calling thread.
     *
     * @param inputSentencesFilePath path of the sentences file
     * @param inputAnnotationsFilePath path of the annotations file
     * @param context training context
     * @param processorCls processor class to instantiate
     * @param args extra constructor arguments for the processor
     * @throws RuntimeException if the processor cannot be created
     */
    private void processFiles(final String inputSentencesFilePath, final String inputAnnotationsFilePath,
            TrainContext context, final Class<? extends Processor> processorCls, Object... args) {

        File sentencesFile = new File(inputSentencesFilePath);
        File annotationsFile = new File(inputAnnotationsFilePath);

        // The corpus identifier is the base name of the sentences file
        Corpus corpus = newCorpus(context, FilenameUtils.getBaseName(sentencesFile.getName()));

        // Make in/out corpus wrappers
        InputFile inputSentencesFile = new InputFile(corpus, sentencesFile, false);
        InputFile inputAnnotationsFile = new InputFile(corpus, annotationsFile, false);

        logProcessingStart();

        createProcessor(processorCls, context, inputSentencesFile, inputAnnotationsFile,
                PROCESSOR_CREATION_ERROR, args).run();
    }

    /**
     * Phase 1, sentences only: runs one processor over a single file. The processor runs
     * synchronously on the calling thread.
     *
     * @param inputSentencesFilePath path of the sentences file
     * @param context training context
     * @param processorCls processor class to instantiate
     * @param args extra constructor arguments for the processor
     * @throws RuntimeException if the processor cannot be created
     */
    private void processFiles(final String inputSentencesFilePath, TrainContext context,
            final Class<? extends Processor> processorCls, Object... args) {

        File sentencesFile = new File(inputSentencesFilePath);

        // The corpus identifier is the base name of the sentences file
        Corpus corpus = newCorpus(context, FilenameUtils.getBaseName(sentencesFile.getName()));

        // Make in/out corpus wrappers
        InputFile inputSentencesFile = new InputFile(corpus, sentencesFile, false);

        logProcessingStart();

        createProcessor(processorCls, context, inputSentencesFile, null, PROCESSOR_CREATION_ERROR, args).run();
    }

    /**
     * Phase 2 process files: runs one processor over a placeholder input file.
     *
     * <p>A file named {@code tmp.txt}, containing a single space, is written to the working
     * directory only to keep the same processor constructor shape as phase 1. A failure to
     * write it is logged and processing continues.</p>
     *
     * @param context training context
     * @param processorCls processor class to instantiate
     * @param args extra constructor arguments for the processor
     * @throws RuntimeException if the processor cannot be created
     */
    private void processFiles2(TrainContext context, final Class<? extends Processor> processorCls,
            Object... args) {

        // The corpus gets an empty identifier in this phase
        Corpus corpus = newCorpus(context, "");

        // Temporary file (to maintain the structure)
        File tmpFile = new File("tmp.txt");

        // try-with-resources closes the writer on every path
        try (PrintWriter writer = new PrintWriter(tmpFile)) {
            writer.write(" ");
        } catch (IOException ex) {
            // Best effort, as before: report through the logger (with the cause) and carry on
            logger.error("An error ocurred while creating the temporary file.", ex);
        }

        // Make in/out corpus wrappers
        InputFile inputCorpusPathFile = new InputFile(corpus, tmpFile, false);

        logProcessingStart();

        createProcessor(processorCls, context, inputCorpusPathFile, null, PROCESSOR_CREATION_ERROR, args).run();
    }

    /**
     * Phase 1, folder input: creates one processor per file of the folder and runs them on a
     * fixed-size thread pool, waiting for all of them to finish.
     *
     * <p>For the A1 format the folder content is first split into text files and annotation
     * files by {@link A1Utils#separateTextAnnotations}; the annotation at index {@code i} is
     * paired with the text file at index {@code i}.</p>
     *
     * @param inputFolderPath path of the folder holding the input files
     * @param numThreads size of the thread pool
     * @param context training context
     * @param processorCls processor class to instantiate for each file
     * @param args extra constructor arguments for each processor
     * @return number of files whose processor completed without throwing
     * @throws IllegalArgumentException if the folder cannot be listed
     * @throws RuntimeException if multi-threading support or a processor cannot be set up, or
     *                          if the thread is interrupted while waiting for pool termination
     */
    private int processMultipleFiles(final String inputFolderPath, final int numThreads, TrainContext context,
            final Class<? extends Processor> processorCls, Object... args) {

        int filesProcessed = 0;

        // Getting folder files
        File inputFolder = new File(inputFolderPath);
        File[] files = inputFolder.listFiles();

        // listFiles() yields null when the path is not a directory or an I/O error occurs
        if (files == null) {
            String m = "The input folder does not exist, is not a directory or cannot be read: "
                    + inputFolder.getAbsolutePath();
            logger.error(m);
            throw new IllegalArgumentException(m);
        }

        File[] annotations = null;
        boolean hasAnnotationFiles = false;

        // Determine if format is A1, to separate .txt and .a1 files, before processing
        if (context.getConfiguration().getInputFormat().equals(InputFormat.A1)) {
            A1Pairs a1Pairs = A1Utils.separateTextAnnotations(files);
            files = a1Pairs.getFiles();
            annotations = a1Pairs.getAnnotations();
            hasAnnotationFiles = true;
        }

        // Multi-threading
        try {
            logger.info("Installing multi-threading support...");
            context.addMultiThreadingSupport(numThreads);
        } catch (NejiException ex) {
            String m = "There was a problem installing multi-threading support.";
            logger.error(m, ex);
            throw new RuntimeException(m, ex);
        }

        // Start thread pool
        logger.info("Starting thread pool with support for {} threads...", numThreads);
        ExecutorService executor = Executors.newFixedThreadPool(numThreads);

        List<Future<?>> futures = new ArrayList<>(files.length);

        try {
            // Iterate over files
            for (int i = 0; i < files.length; i++) {

                // By default, the corpus identifier is the file name
                Corpus corpus = newCorpus(context, FilenameUtils.getBaseName(files[i].getName()));

                // Make in/out corpus wrappers
                InputFile inputSentencesFile = new InputFile(corpus, files[i], false);
                InputFile inputAnnotationsFile = null;
                if (hasAnnotationFiles) {
                    inputAnnotationsFile = new InputFile(corpus, annotations[i], false);
                }

                Processor processor = createProcessor(processorCls, context, inputSentencesFile,
                        inputAnnotationsFile,
                        "There was a problem creating the processor of the file: " + files[i].getAbsolutePath(),
                        args);

                futures.add(executor.submit(processor));
            }
        } finally {
            // Always stop accepting work, even when a processor could not be created, so the
            // pool threads end once the already submitted tasks finish instead of leaking.
            executor.shutdown();
        }

        logger.info("");
        logger.info("{} file(s) to process.", futures.size());
        logger.info("Started processing...");

        // Remember an interruption instead of losing it; the remaining tasks are still awaited
        boolean interrupted = false;
        for (Future<?> future : futures) {
            try {
                future.get();
                filesProcessed++;
            } catch (ExecutionException ex) {
                logger.error("There was a problem running the processor.", ex);
            } catch (InterruptedException ex) {
                interrupted = true;
                logger.error("There was a problem running the processor.", ex);
            }
        }

        try {
            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
        } catch (InterruptedException ex) {
            // Restore the interrupt status for the callers before reporting the failure
            Thread.currentThread().interrupt();
            String m = "There was a problem executing the processing tasks.";
            logger.error(m, ex);
            throw new RuntimeException(m, ex);
        }

        if (interrupted) {
            Thread.currentThread().interrupt();
        }

        return filesProcessed;
    }

    /**
     * Creates a processor that receives a sentences input and an annotations input.
     *
     * <p>The constructor is looked up from the runtime classes of {@code context},
     * {@code sentencesInput}, {@code annotationsInput} and every element of {@code args},
     * in that order.</p>
     *
     * @param processorCls processor class to instantiate, not null
     * @param context execution context, not null
     * @param sentencesInput sentences input wrapper, not null
     * @param annotationsInput annotations input wrapper, not null
     * @param args extra constructor arguments; the array may be null, its elements may not
     *             (their runtime class is read)
     * @return the new processor
     * @throws NejiException if no suitable constructor exists or the constructor fails
     */
    protected static <A, B, C> Processor newProcessor(final Class<? extends Processor> processorCls,
            final Context context, final A sentencesInput, final B annotationsInput, final Object... args)
            throws NejiException {

        Validate.notNull(processorCls);
        Validate.notNull(context);
        Validate.notNull(sentencesInput);
        Validate.notNull(annotationsInput);

        List<Object> values = new ArrayList<>(3 + (args != null ? args.length : 0));
        values.add(context);
        values.add(sentencesInput);
        values.add(annotationsInput);

        return instantiate(processorCls, values, args);
    }

    /**
     * Creates a processor that receives only a sentences input.
     *
     * <p>The constructor is looked up from the runtime classes of {@code context},
     * {@code sentencesInput} and every element of {@code args}, in that order.</p>
     *
     * @param processorCls processor class to instantiate, not null
     * @param context execution context, not null
     * @param sentencesInput sentences input wrapper, not null
     * @param args extra constructor arguments; the array may be null, its elements may not
     *             (their runtime class is read)
     * @return the new processor
     * @throws NejiException if no suitable constructor exists or the constructor fails
     */
    protected static <A, C> Processor newProcessor(final Class<? extends Processor> processorCls,
            final Context context, final A sentencesInput, final Object... args) throws NejiException {

        Validate.notNull(processorCls);
        Validate.notNull(context);
        Validate.notNull(sentencesInput);

        List<Object> values = new ArrayList<>(2 + (args != null ? args.length : 0));
        values.add(context);
        values.add(sentencesInput);

        return instantiate(processorCls, values, args);
    }

    /** Builds a BIO-labelled corpus for the entity of the context, with the given identifier. */
    private static Corpus newCorpus(TrainContext context, String identifier) {
        Corpus corpus = new Corpus(LabelFormat.BIO, context.getEntity());
        corpus.setIdentifier(identifier);
        return corpus;
    }

    /** Writes the two log lines that announce the start of a single-processor run. */
    private static void logProcessingStart() {
        logger.info("");
        logger.info("Started processing...");
    }

    /**
     * Creates a processor through the matching {@code newProcessor} overload (without
     * annotations when {@code annotationsInput} is null); a failure is logged and rethrown
     * as a {@link RuntimeException} carrying {@code errorMessage}.
     */
    private static Processor createProcessor(Class<? extends Processor> processorCls, TrainContext context,
            InputFile sentencesInput, InputFile annotationsInput, String errorMessage, Object[] args) {
        try {
            if (annotationsInput == null) {
                return newProcessor(processorCls, context, sentencesInput, args);
            }
            return newProcessor(processorCls, context, sentencesInput, annotationsInput, args);
        } catch (NejiException ex) {
            logger.error(errorMessage, ex);
            throw new RuntimeException(errorMessage, ex);
        }
    }

    /**
     * Appends {@code args} to the fixed constructor values and invokes the processor
     * constructor matching the runtime class of every value.
     */
    private static Processor instantiate(Class<? extends Processor> processorCls, List<Object> values,
            Object[] args) throws NejiException {

        if (args != null) {
            for (Object arg : args) {
                values.add(arg);
            }
        }

        // Parameter types are the runtime classes of the values, in the same order
        Class<?>[] types = new Class<?>[values.size()];
        for (int i = 0; i < types.length; i++) {
            types[i] = values.get(i).getClass();
        }

        try {
            return (Processor) ConstructorUtils.invokeConstructor(processorCls, values.toArray(), types);
        } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException
                | InstantiationException ex) {
            logger.error(PIPELINE_CREATION_ERROR, ex);
            throw new NejiException(PIPELINE_CREATION_ERROR, ex);
        }
    }
}