com.addthis.hydra.task.pipeline.PipelineTask.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.hydra.task.pipeline.PipelineTask.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.pipeline;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.IOException;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

import java.nio.file.Path;

import com.addthis.hydra.task.map.StreamMapper;
import com.addthis.hydra.task.run.TaskRunnable;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>This is a <span class="hydra-summary">pipeline of one or more hydra jobs</span>.
 * It is specified with {@code type : "pipeline"}.</p>
 * <p>A pipeline job will run the first job phase to completion, then the second
 * phase, then the third phase, etc. When a pipeline job stops and is restarted
 * it begins processing from the first phase. It is recommended that phases use
 * the traditional mechanisms for processing data at most once, ie. the marks
 * directories.</p>
 * <p>Use the notation {@code ${hydra.task.jobid}} to retrieve the identifier
 * of the currently running job. This should be used by downstream phases to read
 * the files produced by an upstream phase.</p>
 * <p>By default error checking is enabled to verify that each job phase
 * does not write to an output directory of another job phase. If you want
 * to disable this error checking then set {@code validateDirs} to false.</p>
 * <p>Individual phases can be skipped by setting the matching entry of the
 * optional {@code disable} array to true. When present, the array must have
 * the same length as {@code phases}.</p>
 * <p>All logging information is printed out using human (counting from 1)
 * numbering of the phases.</p>
 * <p>Example:</p>
 * <pre>
 *  {pipeline.phases:[]}
 * </pre>
 *
 * @user-reference
 * @hydra-name pipeline
 */
public class PipelineTask implements TaskRunnable {

    private static final Logger log = LoggerFactory.getLogger(PipelineTask.class);

    /**
     * The job phases, in execution order.
     */
    @Nonnull
    private final StreamMapper[] phases;

    /**
     * Optional per-phase flags. When non-null it has the same length as
     * {@code phases} and a true entry means the phase at that index is skipped.
     */
    @Nullable
    private final boolean[] disable;

    /**
     * If true then ensure that writable directories are all unique.
     **/
    private final boolean validateDirs;

    /**
     * Completion futures of every phase except the last one, in phase order.
     */
    private final ImmutableList<CompletableFuture<Void>> phaseComplete;

    /**
     * For each entry of {@code phaseComplete}, the dependent future that
     * calls {@code beginPhase} for the following phase.
     */
    private final ImmutableList<CompletableFuture<Void>> phaseNext;

    /**
     * The phase whose {@code start()} most recently returned, or null when no
     * phase has been started yet or a phase is currently being initialized.
     */
    private volatile StreamMapper currentPhase = null;

    /**
     * Creates the pipeline, chains each phase's completion future to the start
     * of the following phase, and validates the output directories.
     *
     * @param phases       the phases to run, in order
     * @param disable      optional flags marking phases to skip; may be null
     * @param validateDirs whether to verify that enabled phases do not share
     *                     output directories
     * @throws IllegalStateException    if {@code disable} is non-null and its
     *                                  length differs from {@code phases}
     * @throws IllegalArgumentException if validation is enabled and two enabled
     *                                  phases have overlapping output directories
     */
    @JsonCreator
    public PipelineTask(@JsonProperty("phases") @Nonnull StreamMapper[] phases,
            @JsonProperty("disable") boolean[] disable, @JsonProperty("validateDirs") boolean validateDirs) {
        this.phases = phases;
        this.validateDirs = validateDirs;
        this.disable = disable;
        if ((disable != null) && (disable.length != phases.length)) {
            throw new IllegalStateException("disable array is not of equal length as phases array");
        }
        // The last phase has no successor, so it needs no transition future.
        int futures = Math.max(phases.length - 1, 0);
        ImmutableList.Builder<CompletableFuture<Void>> complete = new ImmutableList.Builder<>();
        ImmutableList.Builder<CompletableFuture<Void>> next = new ImmutableList.Builder<>();
        for (int i = 0; i < futures; i++) {
            final int current = i;
            CompletableFuture<Void> phaseCompleteFuture = phases[i].getCompletionFuture();
            CompletableFuture<Void> phaseNextFuture = phaseCompleteFuture.thenRun(() -> beginPhase(current + 1));
            complete.add(phaseCompleteFuture);
            next.add(phaseNextFuture);
        }
        this.phaseComplete = complete.build();
        this.phaseNext = next.build();
        validateWritableRootPaths();
    }

    /**
     * Starts the pipeline at the first phase (or the first enabled phase
     * after it).
     */
    @Override
    public void start() {
        beginPhase(0);
    }

    /**
     * Shuts the pipeline down. Every phase completion future is cancelled
     * (last to first); for those that could not be cancelled the matching
     * {@code phaseNext} future is joined so that any in-flight phase start
     * has finished before the current phase is closed.
     *
     * <p>Note that when joining a {@code phaseNext} future fails, the
     * exception is rethrown and the current phase is not closed by this
     * method.</p>
     *
     * @throws IOException if a {@code phaseNext} future completed
     *                     exceptionally or was cancelled elsewhere
     * @throws Exception   if closing the current phase fails
     */
    @Override
    public void close() throws Exception {
        log.info("Pipeline task is starting shutdown");
        int size = phaseComplete.size();
        boolean[] cancel = new boolean[size];
        for (int i = (size - 1); i >= 0; i--) {
            // cancel() returns false when the future had already completed.
            cancel[i] = phaseComplete.get(i).cancel(false);
        }
        for (int i = (size - 1); i >= 0; i--) {
            try {
                if (!cancel[i]) {
                    phaseNext.get(i).join();
                }
            } catch (CompletionException ex) {
                String msg = "Phase " + (i + 1) + " phaseNext future encountered an "
                        + "exception while starting phase " + (i + 2);
                throw new IOException(msg, ex);
            } catch (CancellationException ex) {
                String msg = "Race condition: Phase " + (i + 1) + " phaseNext "
                        + "future was cancelled by another thread";
                throw new IOException(msg, ex);
            }
        }
        /*
         * At this point all phaseNext futures have either
         * completed or have been cancelled. It is now
         * safe to close the current phase.
         */
        if (currentPhase != null) {
            currentPhase.close();
        }
    }

    /**
     * Begin a phase. Clears the {@code currentPhase} before initializing
     * the new phase and assigns {@code currentPhase} on successful
     * initialization. A disabled phase is skipped and the following phase
     * is begun instead. Does nothing when {@code pos} is past the last phase.
     *
     * @param pos zero-based index of the phase to begin
     */
    private void beginPhase(int pos) {
        if (pos >= phases.length) {
            return;
        }
        if ((disable != null) && (disable[pos])) {
            log.info("Skipping phase {} because it is disabled.", pos + 1);
            beginPhase(pos + 1);
        } else {
            log.info("Initializing phase {} for execution.", pos + 1);
            // Left null while start() runs so close() never sees a half-started phase.
            currentPhase = null;
            phases[pos].start();
            currentPhase = phases[pos];
        }
    }

    /**
     * Collects the writable root paths of every enabled phase, in phase order.
     *
     * @return the concatenated writable root paths of all enabled phases
     */
    @Nonnull
    @Override
    public ImmutableList<Path> writableRootPaths() {
        ImmutableList.Builder<Path> builder = new ImmutableList.Builder<>();
        for (int i = 0; i < phases.length; i++) {
            if ((disable != null) && disable[i]) {
                continue;
            }
            builder.addAll(phases[i].writableRootPaths());
        }
        return builder.build();
    }

    /**
     * When {@code validateDirs} is set, asks every phase to validate its own
     * writable root paths and then verifies that no two enabled phases share
     * an output directory. Disabled phases take no part in the overlap check.
     *
     * @throws IllegalArgumentException if two enabled phases have overlapping
     *                                  output directories; the message lists
     *                                  every offending pair
     */
    public void validateWritableRootPaths() {
        if (!validateDirs) {
            return;
        }
        for (StreamMapper phase : phases) {
            phase.validateWritableRootPaths();
        }
        // Generic array creation is unchecked; only Set<Path> instances are stored.
        @SuppressWarnings("unchecked")
        Set<Path>[] outputDirs = new Set[phases.length];
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < phases.length; i++) {
            if ((disable != null) && disable[i]) {
                // Entry stays null; it is skipped below when used as an earlier phase.
                continue;
            }
            outputDirs[i] = new HashSet<>(phases[i].writableRootPaths());
            for (int j = 0; j < i; j++) {
                if (outputDirs[j] == null) {
                    // Phase j is disabled. Sets.intersection rejects null arguments.
                    continue;
                }
                Sets.SetView<Path> intersect = Sets.intersection(outputDirs[i], outputDirs[j]);
                if (!intersect.isEmpty()) {
                    String message = String.format("Phases %d and %d have overlapping output directories: \"%s\"\n",
                            (j + 1), (i + 1), intersect);
                    builder.append(message);
                }
            }
        }
        if (builder.length() > 0) {
            throw new IllegalArgumentException(builder.toString());
        }
    }

}