/*
 * info.pancancer.arch3.worker.WorkerRunnable.java — source listing.
 */

/*
 *     Consonance - workflow software for multiple clouds
 *     Copyright (C) 2016 OICR
 *
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package info.pancancer.arch3.worker;

import info.pancancer.arch3.Base;
import info.pancancer.arch3.beans.Job;
import info.pancancer.arch3.beans.Status;
import info.pancancer.arch3.beans.StatusState;
import info.pancancer.arch3.utils.Constants;
import info.pancancer.arch3.utils.Utilities;
import io.cloudbindle.youxia.util.Log;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.Inet4Address;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.FileAttribute;
import java.nio.file.attribute.PosixFilePermission;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.configuration.HierarchicalINIConfiguration;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecuteResultHandler;
import org.apache.commons.exec.DefaultExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.rabbitmq.client.AlreadyClosedException;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.MessageProperties;
import com.rabbitmq.client.QueueingConsumer;

/**
 * This class represents a WorkerRunnable, in the Architecture 3 design.
 *
 * A WorkerRunnable can receive job messages from a queue and execute a seqware workflow based on the contents of the job message. Created
 * by boconnor on 15-04-18.
 */
public class WorkerRunnable implements Runnable {

    private static final String NO_MESSAGE_FROM_QUEUE_MESSAGE = " [x] Job request came back null/empty! ";
    protected final Logger log = LoggerFactory.getLogger(getClass());
    private HierarchicalINIConfiguration settings = null;
    private Channel resultsChannel = null;
    private String queueName = null;
    private String jobQueueName;
    private String resultsQueueName;
    private String vmUuid = null;
    private int maxRuns = 1;
    private String userName;
    private boolean testMode;
    private boolean endless = false;
    public static final int DEFAULT_PRESLEEP = 1;
    public static final int DEFAULT_POSTSLEEP = 1;
    private String networkAddress;

    /**
     * Create a new Worker.
     *
     * @param configFile
     *            - The name of the configuration file to read.
     * @param vmUuid
     *            - The UUID of the VM on which this worker is running.
     * @param maxRuns
     *            - The maximum number of workflows this Worker should execute.
     */
    public WorkerRunnable(String configFile, String vmUuid, int maxRuns) {
        this(configFile, vmUuid, maxRuns, false, false);
    }

    /**
     * Create a new Worker.
     *
     * @param configFile
     *            - The name of the configuration file to read.
     * @param vmUuid
     *            - The UUID of the VM on which this worker is running.
     * @param maxRuns
     *            - The maximum number of workflows this Worker should execute.
     * @param testMode
     *            - Should this worker run in testMode (seqware job will not actually be launched)
     * @param endless
     *            - have the worker pick up new jobs as the current job finishes successfully
     */
    public WorkerRunnable(String configFile, String vmUuid, int maxRuns, boolean testMode, boolean endless) {
        log.debug("WorkerRunnable created with args:\n\tconfigFile: " + configFile + "\n\tvmUuid: " + vmUuid
                + "\n\tmaxRuns: " + maxRuns + "\n\ttestMode: " + testMode + "\n\tendless: " + endless);

        try {
            // substring(1) strips the leading "/" from InetAddress.toString() ("hostname/1.2.3.4" or "/1.2.3.4")
            this.networkAddress = getFirstNonLoopbackAddress().toString().substring(1);
        } catch (SocketException e) {
            log.error("Could not get network address: " + e.getMessage(), e);
            // Preserve the cause so the resulting stack trace points at the real network failure.
            throw new RuntimeException("Could not get network address: " + e.getMessage(), e);
        }

        this.maxRuns = maxRuns;
        settings = Utilities.parseConfig(configFile);

        // TODO: Dyanmically change path to log file, it should be /var/log/arch3.log in production, but for test, ./arch3.log
        // FileAppender<ILoggingEvent> appender = (FileAppender<ILoggingEvent>)
        // ((ch.qos.logback.classic.Logger)log).getAppender("FILE_APPENDER");
        // appender.setFile("SomePath");

        this.queueName = settings.getString(Constants.RABBIT_QUEUE_NAME);
        if (this.queueName == null) {
            throw new NullPointerException(
                    "Queue name was null! Please ensure that you have properly configured \"rabbitMQQueueName\" in your config file.");
        }
        this.jobQueueName = this.queueName + "_jobs";
        this.resultsQueueName = this.queueName + "_results";
        this.userName = settings.getString(Constants.WORKER_HOST_USER_NAME, "ubuntu");
        /*
         * If the user specified "--endless" on the CLI, then this.endless=true Else: check to see if "endless" is in the config file, and
         * if it is, parse the value of it and use that. If not in the config file, then use "false".
         */
        this.endless = endless || settings.getBoolean(Constants.WORKER_ENDLESS, false);
        if (this.endless) {
            log.info("The \"--endless\" flag was set, this worker will run endlessly!");
        }
        this.vmUuid = vmUuid;
        this.testMode = testMode;
    }

    /**
     * Main worker loop: pulls up to maxRuns job messages (or loops forever when endless),
     * launches the corresponding seqware workflow for each, and publishes a final status
     * message per job on the results exchange.
     */
    @Override
    public void run() {

        int max = maxRuns;

        try {
            // the VM UUID
            log.info(" WORKER VM UUID provided as: '" + vmUuid + "'");
            // write to
            // TODO: Add some sort of "local debug" mode so that developers working on their local
            // workstation can declare the queue if it doesn't exist. Normally, the results queue is
            // created by the Coordinator.
            resultsChannel = Utilities.setupExchange(settings, this.resultsQueueName);

            while (max > 0 || this.endless) {
                log.debug(max + " remaining jobs will be executed");
                log.info(" WORKER IS PREPARING TO PULL JOB FROM QUEUE " + this.jobQueueName);

                if (!endless) {
                    max--;
                }

                // jobChannel needs to be created inside the loop because it is closed inside the loop, and it is closed inside this loop to
                // prevent pre-fetching.
                Channel jobChannel = Utilities.setupQueue(settings, this.jobQueueName);
                if (jobChannel == null) {
                    throw new NullPointerException("jobChannel is null for queue: " + this.jobQueueName
                            + ". Something bad must have happened while trying to set up the queue connections. Please ensure that your configuration is correct.");
                }
                // autoAck=false: we ack manually (and greedily) once the message body has been read.
                QueueingConsumer consumer = new QueueingConsumer(jobChannel);
                jobChannel.basicConsume(this.jobQueueName, false, consumer);

                QueueingConsumer.Delivery delivery = consumer.nextDelivery();
                log.info(vmUuid + "  received " + delivery.getEnvelope().toString());
                if (delivery.getBody() != null) {
                    String message = new String(delivery.getBody(), StandardCharsets.UTF_8);
                    if (message.trim().length() > 0) {

                        log.info(" [x] Received JOBS REQUEST '" + message + "' @ " + vmUuid);

                        Job job = new Job().fromJSON(message);

                        Status status = new Status(vmUuid, job.getUuid(), StatusState.RUNNING,
                                Utilities.JOB_MESSAGE_TYPE, "job is starting", this.networkAddress);
                        status.setStderr("");
                        status.setStdout("");
                        String statusJSON = status.toJSON();

                        log.info(" WORKER LAUNCHING JOB");

                        // greedy acknowledge, it will be easier to deal with lost jobs than zombie workers in hostile OpenStack
                        // environments
                        log.info(vmUuid + " acknowledges " + delivery.getEnvelope().toString());
                        jobChannel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
                        // we need to close the channel IMMEDIATELY to complete the ACK.
                        jobChannel.close();
                        // Close the connection object as well, or the main thread may not exit because of still-open-and-in-use resources.
                        jobChannel.getConnection().close();

                        WorkflowResult workflowResult = new WorkflowResult();
                        if (testMode) {
                            workflowResult.setWorkflowStdout("everything is awesome");
                            workflowResult.setExitCode(0);
                        } else {
                            String seqwareEngine = settings.getString(Constants.WORKER_SEQWARE_ENGINE,
                                    Constants.SEQWARE_WHITESTAR_ENGINE);
                            String seqwareSettingsFile = settings.getString(Constants.WORKER_SEQWARE_SETTINGS_FILE);
                            String dockerImage = settings.getString(Constants.WORKER_SEQWARE_DOCKER_IMAGE_NAME);
                            workflowResult = launchJob(statusJSON, job, seqwareEngine, seqwareSettingsFile,
                                    dockerImage);
                        }

                        status = new Status(vmUuid, job.getUuid(),
                                workflowResult.getExitCode() == 0 ? StatusState.SUCCESS : StatusState.FAILED,
                                Utilities.JOB_MESSAGE_TYPE, "job is finished", networkAddress);
                        status.setStderr(workflowResult.getWorkflowStdErr());
                        status.setStdout(workflowResult.getWorkflowStdout());
                        statusJSON = status.toJSON();

                        log.info(" WORKER FINISHING JOB");

                        finishJob(statusJSON);
                    } else {
                        log.info(NO_MESSAGE_FROM_QUEUE_MESSAGE);
                    }
                    // we need to close the channel *conditionally* (the non-empty-message path already closed it)
                    if (jobChannel.isOpen()) {
                        jobChannel.close();
                    }
                    // Close the connection object as well, or the main thread may not exit because of still-open-and-in-use resources.
                    if (jobChannel.getConnection().isOpen()) {
                        jobChannel.getConnection().close();
                    }
                } else {
                    log.info(NO_MESSAGE_FROM_QUEUE_MESSAGE);
                }

                if (endless) {
                    log.info("attempting to reset workspace");
                    DefaultExecutor executor = new DefaultExecutor();
                    DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler();
                    // attempt a cleanup
                    CommandLine cli = new CommandLine("sudo");
                    List<String> args = new ArrayList<>(Arrays.asList("rm", "-rf", "/datastore/*"));
                    cli.addArguments(args.toArray(new String[args.size()]));
                    executor.execute(cli, resultHandler);
                    // Use the result handler for non-blocking call, so this way we should be able to get updates of
                    // stdout and stderr while the command is running.
                    resultHandler.waitFor();
                    log.info("exit code for cleanup: " + resultHandler.getExitValue());
                }
            }
            log.info(" \n\n\nWORKER FOR VM UUID HAS FINISHED!!!: '" + vmUuid + "'\n\n");

            // turns out this is needed when multiple threads are reading from the same
            // queue otherwise you end up with multiple unacknowledged messages being undeliverable to other workers!!!
            if (resultsChannel != null && resultsChannel.isOpen()) {
                resultsChannel.close();
                resultsChannel.getConnection().close();
            }
            // guard against NPE: resultsChannel stays null if exchange setup failed above
            if (resultsChannel != null) {
                log.debug("result channel open: " + resultsChannel.isOpen());
                log.debug("result channel connection open: " + resultsChannel.getConnection().isOpen());
            }
        } catch (Exception ex) {
            log.error(ex.getMessage(), ex);
        }
    }

    /**
     * Write the content of the job object to an INI file which will be used by the workflow.
     *
     * @param job
     *            - the job object which must contain a HashMap, which will be used to write an INI file.
     * @return A Path object pointing to the new file will be returned.
     * @throws IOException
     */
    private Path writeINIFile(Job job) throws IOException {
        log.info("INI is: " + job.getIniStr());
        // NOTE(review): GROUP_WRITE/OTHERS_WRITE make the INI world-writable; presumably required so the
        // seqware user inside the container can read/modify it — confirm before tightening.
        EnumSet<PosixFilePermission> perms = EnumSet.of(PosixFilePermission.OWNER_READ,
                PosixFilePermission.OWNER_WRITE, PosixFilePermission.OWNER_EXECUTE, PosixFilePermission.GROUP_READ,
                PosixFilePermission.GROUP_WRITE, PosixFilePermission.OTHERS_READ, PosixFilePermission.OTHERS_WRITE);
        FileAttribute<?> attrs = PosixFilePermissions.asFileAttribute(perms);
        Path pathToINI = Files.createTempFile("seqware_", ".ini", attrs);
        log.info("INI file: " + pathToINI.toString());
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(pathToINI.toFile()), StandardCharsets.UTF_8))) {
            bw.write(job.getIniStr());
            bw.flush();
        }
        return pathToINI;
    }

    // TODO: obviously, this will need to launch something using Youxia in the future
    /**
     * This function will execute a workflow, based on the content of the Job object that is passed in.
     *
     * @param message
     *            - The message that will be published on the queue when the worker starts running the job.
     * @param job
     *            - The job contains information about what workflow to execute, and how.
     * @param seqwareEngine
     *            - The seqware engine to pass to "seqware bundle launch".
     * @param seqwareSettingsFile
     *            - Optional path to a seqware settings file to mount into the container; may be null.
     * @param dockerImage
     *            - The docker image to run; a default image is used when null or blank.
     * @return The complete stdout and stderr from the workflow execution will be returned.
     */
    private WorkflowResult launchJob(String message, Job job, String seqwareEngine, String seqwareSettingsFile,
            String dockerImage) {
        WorkflowResult workflowResult = null;
        // two tasks run concurrently: the workflow itself and the heartbeat reporter
        ExecutorService exService = Executors.newFixedThreadPool(2);
        WorkflowRunner workflowRunner = new WorkflowRunner();
        try {

            Path pathToINI = writeINIFile(job);
            resultsChannel.basicPublish(this.resultsQueueName, this.resultsQueueName,
                    MessageProperties.PERSISTENT_TEXT_PLAIN, message.getBytes(StandardCharsets.UTF_8));
            resultsChannel.waitForConfirms();

            //TODO: Parameterize dockerImage
            // BUGFIX: was "dockerImage.trim() == null", which is always false; a blank (all-whitespace)
            // image name must also fall back to the default image.
            if (dockerImage == null || dockerImage.trim().isEmpty()) {
                dockerImage = "pancancer/seqware_whitestar_pancancer:latest";
            }
            CommandLine cli = new CommandLine("docker");
            cli.addArgument("run");
            List<String> args = new ArrayList<>(
                    Arrays.asList("--rm", "-h", "master", "-t", "-v", "/var/run/docker.sock:/var/run/docker.sock",
                            "-v", job.getWorkflowPath() + ":/workflow", "-v", pathToINI + ":/ini", "-v",
                            "/datastore:/datastore", "-v", "/home/" + this.userName + "/.gnos:/home/ubuntu/.gnos"));
            if (seqwareSettingsFile != null) {
                args.addAll(Arrays.asList("-v", seqwareSettingsFile + ":/home/seqware/.seqware/settings"));
            }
            args.addAll(Arrays.asList(dockerImage, "seqware", "bundle", "launch", "--dir", "/workflow", "--ini",
                    "/ini", "--no-metadata", "--engine", seqwareEngine));

            String[] argsArray = new String[args.size()];
            cli.addArguments(args.toArray(argsArray));

            WorkerHeartbeat heartbeat = new WorkerHeartbeat();
            heartbeat.setQueueName(this.resultsQueueName);
            // channels should not be shared between threads https://www.rabbitmq.com/api-guide.html#channel-threads
            // heartbeat.setReportingChannel(resultsChannel);
            heartbeat.setSettings(settings);
            heartbeat.setSecondsDelay(
                    settings.getDouble(Constants.WORKER_HEARTBEAT_RATE, WorkerHeartbeat.DEFAULT_DELAY));
            heartbeat.setJobUuid(job.getUuid());
            heartbeat.setVmUuid(this.vmUuid);
            heartbeat.setNetworkID(this.networkAddress);
            heartbeat.setStatusSource(workflowRunner);

            long presleep = settings.getLong(Constants.WORKER_PREWORKER_SLEEP, WorkerRunnable.DEFAULT_PRESLEEP);
            long postsleep = settings.getLong(Constants.WORKER_POSTWORKER_SLEEP, WorkerRunnable.DEFAULT_POSTSLEEP);
            long presleepMillis = Base.ONE_SECOND_IN_MILLISECONDS * presleep;
            long postsleepMillis = Base.ONE_SECOND_IN_MILLISECONDS * postsleep;

            workflowRunner.setCli(cli);
            workflowRunner.setPreworkDelay(presleepMillis);
            workflowRunner.setPostworkDelay(postsleepMillis);
            // Submit both
            @SuppressWarnings("unused")
            // We will never actually do submit.get(), because the heartbeat should keep running until it is terminated by
            // exService.shutdownNow().
            Future<?> submit = exService.submit(heartbeat);
            Future<WorkflowResult> workflowResultFuture = exService.submit(workflowRunner);
            // make sure both are complete
            workflowResult = workflowResultFuture.get();
            // don't get the heartbeat if the workflow is complete already

            log.info("Docker execution result: " + workflowResult.getWorkflowStdout());
        } catch (SocketException e) {
            // This comes from trying to get the IP address.
            log.error(e.getMessage(), e);
        } catch (IOException e) {
            // This could be caused by a problem writing the file, or publishing a message to the queue.
            log.error(e.getMessage(), e);
        } catch (ExecutionException e) {
            log.error("Error executing workflow: " + e.getMessage(), e);
        } catch (InterruptedException e) {
            log.error("Workflow may have been interrupted: " + e.getMessage(), e);
            // restore the interrupt flag so callers can observe the interruption
            Thread.currentThread().interrupt();
        } finally {
            // also terminates the still-running heartbeat task
            exService.shutdownNow();
        }

        return workflowResult;
    }

    /**
     * Get the IP address of this machine, preference is given to returning an IPv4 address, if there is one.
     *
     * Interfaces whose name contains "docker" are skipped, since the docker bridge address is not a
     * loopback address but is useless for identifying this host.
     *
     * @return An InetAddress object, or null if no non-loopback address could be found.
     * @throws SocketException
     */
    public InetAddress getFirstNonLoopbackAddress() throws SocketException {
        final String dockerInterfaceName = "docker";
        for (NetworkInterface i : Collections.list(NetworkInterface.getNetworkInterfaces())) {
            if (i.getName().contains(dockerInterfaceName)) {
                // the virtual ip address for the docker mount is useless but is not a loopback address
                continue;
            }
            log.info("Examining " + i.getName());
            for (InetAddress addr : Collections.list(i.getInetAddresses())) {
                if (!addr.isLoopbackAddress()) {
                    // Prefer IP v4
                    if (addr instanceof Inet4Address) {
                        return addr;
                    }
                }

            }
        }
        // use the instance slf4j logger, consistent with the rest of this class (was youxia's static Log)
        log.info("Could not find an ipv4 address");
        for (NetworkInterface i : Collections.list(NetworkInterface.getNetworkInterfaces())) {
            log.info("Examining " + i.getName());
            if (i.getName().contains(dockerInterfaceName)) {
                // the virtual ip address for the docker mount is useless but is not a loopback address
                continue;
            }
            // If we got here it means we never found an IP v4 address, so we'll have to return the IPv6 address.
            for (InetAddress addr : Collections.list(i.getInetAddresses())) {
                if (!addr.isLoopbackAddress()) {
                    return addr;
                }
            }
        }
        return null;
    }

    /**
     * Publish a message stating that the job is finished.
     *
     * Retries indefinitely (sleeping one minute between attempts) while the results channel reports
     * AlreadyClosedException; gives up on IOException or interruption.
     *
     * @param message
     *            - The actual message to publish.
     */
    private void finishJob(String message) {
        log.info("Publishing worker results to results channel " + this.resultsQueueName + ": " + message);
        try {
            boolean success = false;
            do {
                try {
                    resultsChannel.basicPublish(this.resultsQueueName, this.resultsQueueName,
                            MessageProperties.PERSISTENT_TEXT_PLAIN, message.getBytes(StandardCharsets.UTF_8));
                    resultsChannel.waitForConfirms();
                    success = true;
                } catch (AlreadyClosedException e) {
                    // retry indefinitely if the connection is down
                    log.error("could not send closed message, retrying", e);
                    Thread.sleep(Base.ONE_MINUTE_IN_MILLISECONDS);
                }
            } while (!success);

        } catch (IOException e) {
            log.error(e.toString());
        } catch (InterruptedException e) {
            log.error(e.toString());
            // restore the interrupt flag so callers can observe the interruption
            Thread.currentThread().interrupt();
        }
        log.info("Finished job report, let's call it a day");
    }
}