Example usage for org.apache.hadoop.mapred.jobcontrol Job getDependingJobs

Introduction

On this page you can find example usage for org.apache.hadoop.mapred.jobcontrol Job getDependingJobs.

Prototype

public ArrayList<Job> getDependingJobs() 
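
The method returns the jobs this job depends on, as registered through addDependingJob(), or null if no depending job has been added yet. Below is a minimal sketch of the call in isolation; the class name, job names, and empty configurations are made up for illustration:

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;

public class GetDependingJobsSketch {
    public static void main(String[] args) throws IOException {
        JobConf confA = new JobConf();
        confA.setJobName("job-a");
        Job jobA = new Job(confA);

        JobConf confB = new JobConf();
        confB.setJobName("job-b");
        Job jobB = new Job(confB);

        // job-b will not be submitted until job-a completes successfully.
        jobB.addDependingJob(jobA);

        ArrayList<Job> deps = jobB.getDependingJobs();
        System.out.println(deps.size()); // prints 1 (the list contains jobA)
    }
}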

Usage

From source file: com.ebay.erl.mobius.core.MobiusJob.java

License: Apache License

/**
 * Add a job, represented by the <code>aNewJob</code> object, into the execution queue.
 * <p>
 * 
 * Users can use this method to add one or more job configurations to the job queue; the Mobius engine
 * analyzes the <code>aNewJobConf</code> objects in the queue to work out the dependencies between jobs.
 * For example, if job B's input is the output of job A, then job B won't be submitted until A has
 * completed successfully.  If A fails, B will not be submitted.
 * <p>
 *  
 * 
 * @param aNewJobConf a {@link Configuration} object represents a Hadoop job. 
 * @throws IOException
 */
protected void addToExecQueue(Configuration aNewJobConf) throws IOException {
    // Add the new job into execution engine and realize
    // its dependency, if any.
    //
    // To realize the job dependency, we need to analyze the input
    // path of this new job.
    // 
    // The inputs of a job can come from two places:
    // 1) if the new job is not a derived job (i.e., not the result of another
    // MR job), its inputs can be retrieved from "mapred.input.dir", or from
    // {@link MultipleInputs} (e.g., when joining different types of datasets);
    // 2) if the new job is a derived job, its input is the output of a
    // previous MR job.

    String inputFolders = aNewJobConf.get("mapred.input.dir", "");
    if (inputFolders.length() == 0) {
        // the value of "mapred.input.dir" is empty, assuming the inputs of this job 
        // are coming from {@link MultipleInputs}.

        String multipleInputs = aNewJobConf
                .get("mapred.input.dir.mappers"/* for using old MultipleInputs, v0.20.X */, aNewJobConf.get(
                        "mapreduce.input.multipleinputs.dir.formats"/* for new MultipleInputs, v0.23.X */, ""));

        if (multipleInputs.length() > 0) {
            // the input paths of this job are coming from MultipleInputs; extract the input paths.
            // The format from {@link MultipleInputs} is like: hadoop_path1;corresponding_mapper1,hadoop_path2;corresponding_mapper2...
            String[] pathAndMapperPairs = multipleInputs.split(",");
            for (String aPair : pathAndMapperPairs) {
                String[] pathToMapper = aPair.split(";");
                String path = pathToMapper[0];
                String mapper = pathToMapper[1]; // mapper class name; not needed for dependency analysis

                if (inputFolders.length() == 0) {
                    inputFolders = getPathOnly(path);
                } else {
                    inputFolders = inputFolders + "," + getPathOnly(path);
                }
            }
        } else {
            throw new IllegalArgumentException("Cannot find input path(s) of job: ["
                    + aNewJobConf.get("mapred.job.name") + "] from the following attributes: "
                    + "mapred.input.dir, mapred.input.dir.mappers, nor mapreduce.input.multipleinputs.dir.formats. "
                    + "Please specify the input path(s) of this job.");
        }
    } else {
        // the input path of this job is specified in mapred.input.dir
        inputFolders = getPathOnly(inputFolders);
    }

    ////////////////////////////////////////////////////////////
    // validate output path of this job, to ensure it doesn't
    // use the same folder of another job's output.
    ////////////////////////////////////////////////////////////
    String outputPath = aNewJobConf.get("mapred.output.dir", "");
    if (outputPath.isEmpty())
        throw new IllegalStateException(
                "Please specify the output directory of job: " + aNewJobConf.get("mapred.job.name"));

    if (this.isOutputOfAnotherJob(outputPath)) {
        throw new IllegalArgumentException("Job [" + aNewJobConf.get("mapred.job.name") + "]'s output ["
                + outputPath + "] is " + "the output of job[" + jobTopology.get(outputPath).getJobName() + "], "
                + "please make sure to use different output folder for each job.");
    }

    //////////////////////////////////////////////////////////////////
    // pass all the validation, start to build the dependencies.
    //////////////////////////////////////////////////////////////////
    Job newJob = new ConfigurableJob(new JobConf(aNewJobConf, this.getClass()));

    newJob.setJobName(aNewJobConf.get("mapred.job.name", aNewJobConf.get("mapreduce.job.name", "Mobius Job")));
    for (String anInputOfNewJob : inputFolders.split(",")) {
        // Added to track inputs for local PC sampling
        inputPaths.add(anInputOfNewJob);

        Job dependsOn = jobTopology.get(this.getFS().makeQualified(new Path(anInputOfNewJob)).toUri());
        if (dependsOn != null) {
            List<Job> dependingJobs = newJob.getDependingJobs();

            // skip if dependsOn is already registered as a dependency
            boolean alreadyInDependency = dependingJobs != null && dependingJobs.contains(dependsOn);
            if (!alreadyInDependency) {
                LOGGER.info(newJob.getJobName() + " depends on " + dependsOn.getJobName());
                newJob.addDependingJob(dependsOn);
            }
        }
    }

    // Put the output of this <code>newJob</code> into the job topology,
    // so that if a later job reads this <code>newJob</code>'s output
    // as its input, the system can detect the dependency.

    URI outputPathURI = this.getFS().makeQualified(new Path(outputPath)).toUri();
    LOGGER.info("Adding Job:" + newJob.getJobName() + "\tOutput:[" + outputPath.toString() + "]");
    jobTopology.put(outputPathURI, newJob);
}
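
The method above only records dependencies; it does not run anything. Below is a hedged sketch of how the jobs collected in jobTopology might then be executed with org.apache.hadoop.mapred.jobcontrol.JobControl; the method name runQueuedJobs, the group name, and the polling interval are made up, and the real Mobius submission logic may differ:

import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;

private void runQueuedJobs() throws InterruptedException {
    JobControl control = new JobControl("mobius-jobs");

    // Every job in the topology must be registered explicitly; JobControl
    // uses the links recorded via addDependingJob to decide submission order.
    for (Job job : jobTopology.values()) {
        control.addJob(job);
    }

    // JobControl implements Runnable; run it on its own thread and poll.
    Thread runner = new Thread(control);
    runner.setDaemon(true);
    runner.start();
    while (!control.allFinished()) {
        Thread.sleep(500);
    }
    control.stop();
}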