Example usage for org.apache.hadoop.fs.Path.SEPARATOR

List of usage examples for org.apache.hadoop.fs.Path.SEPARATOR

Introduction

On this page you can find example usages of org.apache.hadoop.fs.Path.SEPARATOR.

Prototype

public static final String SEPARATOR = "/";

Document

The directory separator, a slash.
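
Before the real-world listings below, here is a minimal sketch (the class name and path strings are invented for illustration) showing Path.SEPARATOR used to join path components, next to the two-argument Path constructor that inserts the separator for you:

import org.apache.hadoop.fs.Path;

public class PathSeparatorExample {
    public static void main(String[] args) {
        // Join string components explicitly with the separator constant ("/").
        Path byString = new Path("output" + Path.SEPARATOR + "part-00000");

        // Equivalent result via the parent/child constructor.
        Path byConstructor = new Path("output", "part-00000");

        System.out.println(byString);      // output/part-00000
        System.out.println(byConstructor); // output/part-00000
    }
}

When one component is already a Path, the Path(Path parent, String child) overload serves the same purpose.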

Usage

From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java

License:Apache License

/**
 * Return the path to local map output file created earlier
 *
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFile() throws IOException {
    return lDirAlloc.getLocalPathToRead(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, conf);
}

From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java

License:Apache License

/**
 * Create a local map output file name.
 *
 * @param size the size of the file
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFileForWrite(long size) throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, size, conf);
}

From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java

License:Apache License

/**
 * Create a local map output file name. This should *only* be used if the size
 * of the file is not known. Otherwise use the equivalent which accepts a size
 * parameter.
 * 
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFileForWrite() throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, conf);
}

From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java

License:Apache License

/**
 * Return the path to a local map output index file created earlier
 *
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputIndexFile() throws IOException {
    return lDirAlloc.getLocalPathToRead(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING, conf);
}

From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java

License:Apache License

/**
 * Create a local map output index file name.
 *
 * @param size the size of the file
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputIndexFileForWrite(long size) throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING, size, conf);
}

From source file:org.apache.tez.test.MiniTezCluster.java

License:Apache License

@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }

    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0L);
    }

    File appJarLocalFile = new File(MiniTezCluster.APPJAR);

    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezCluster.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }

    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezCluster.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));

    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));

    // Both PMEM and VMEM monitoring are disabled for the mini cluster.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");

    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use an absolute path that contains the drive letter: the
         * unit test could run on a different drive than the AM. Otherwise job
         * files may be localized to the drive the test runs on while the AM
         * starts on a different drive and fails to find the job metafiles.
         * Using an absolute path avoids this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);

        // create the history server "done" directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");

    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);

    // Port 0 lets the shuffle handler bind an ephemeral (random free) port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}

From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java

License:Apache License

@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }

    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0L);
    }

    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);

    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }

    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));

    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));

    // Both PMEM and VMEM monitoring are disabled for the mini cluster.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");

    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use an absolute path that contains the drive letter: the
         * unit test could run on a different drive than the AM. Otherwise job
         * files may be localized to the drive the test runs on while the AM
         * starts on a different drive and fails to find the job metafiles.
         * Using an absolute path avoids this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);

        // create the history server "done" directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");

    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);

    // Port 0 lets the shuffle handler bind an ephemeral (random free) port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

static void parseCommandLineArgs(String[] argv, Configuration conf) {

    CommandLineParser parser = new PosixParser();

    Options options = new Options();

    Option gatkdLocOpt = OptionBuilder.withArgName("depjar_loc").hasArg()
            .withDescription("Complete HDFS path of gatk dependency jar").create("djarloc");
    options.addOption(gatkdLocOpt);
    Option bwaLocOpt = OptionBuilder.withArgName("bwa_loc").hasArg()
            .withDescription("Complete HDFS path of bwa binary or bwa.exe file").create("bwaloc");
    options.addOption(bwaLocOpt);
    Option fq1Opt = OptionBuilder.withArgName("fastq_file1").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for 1st fastq file")
            .create("r1");
    options.addOption(fq1Opt);
    Option fq2Opt = OptionBuilder.withArgName("fastq_file2").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for 2nd fastq file")
            .create("r2");
    options.addOption(fq2Opt);
    Option bamOpt = OptionBuilder.withArgName("bam_directory").hasArg()
            .withDescription(
                    "Complete HDFS directory path or path relative to user directory for input BAM file")
            .create("b");
    options.addOption(bamOpt);
    Option outOpt = OptionBuilder.withArgName("output_directory").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for output directory")
            .create("o");
    options.addOption(outOpt);
    Option rSizeOpt = OptionBuilder.withArgName("fastq_read_size").hasArg()
            .withDescription("Number of bytes of a read sequence in input FastQ file").create("rsize");
    options.addOption(rSizeOpt);
    Option rPSplitOpt = OptionBuilder.withArgName("reads_per_map_split").hasArg()
            .withDescription("Optional number of reads to be processed by a mapper").create("reads_per_split");
    options.addOption(rPSplitOpt);
    Option nRedOpt = OptionBuilder.withArgName("number_of_reducers").hasArg()
            .withDescription("Optional number of reducers").create("nred");
    options.addOption(nRedOpt);
    Option nThreadOpt = OptionBuilder.withArgName("number_of_threads").hasArg()
            .withDescription("Optional number of threads").create("nthreads");
    options.addOption(nThreadOpt);
    Option refFileOpt = OptionBuilder.withArgName("path_to_reference_dir").hasArg()
            .withDescription("Complete HDFS path of reference directory").create("ref");
    options.addOption(refFileOpt);
    Option kSiteFileOpt = OptionBuilder.withArgName("path_to_knownsites_dir").hasArg()
            .withDescription("Complete HDFS path of known-sites db directory").create("dbfile");
    options.addOption(kSiteFileOpt);

    Option platformOpt = OptionBuilder.withArgName("Linux/Windows").hasArg()
            .withDescription("Platform to run on").create("p");
    options.addOption(platformOpt);

    Option noAlignOpt = new Option("na", "noalign", false, "Don't run Alignment stage");
    options.addOption(noAlignOpt);

    Option noReAlignOpt = new Option("nra", "norealign", false, "Do not run Local Realignment stage");
    options.addOption(noReAlignOpt);

    Option noMarkDupOpt = new Option("nmd", "nomarkdup", false, "Do not run Mark Duplicates stage");
    options.addOption(noMarkDupOpt);

    Option noQRecabOpt = new Option("nqr", "noqrecab", false, "Do not run Quality Recalibration stage");
    options.addOption(noQRecabOpt);

    Option noVarOpt = new Option("nv", "novariant", false, "Do not run Structural Variant stage");
    options.addOption(noVarOpt);

    Option noFVarOpt = new Option("nfv", "nofvariant", false, "Do not run Filter Variant stage");
    options.addOption(noFVarOpt);

    Option noMerOpt = new Option("nm", "nomresults", false, "Do not Merge Results");
    options.addOption(noMerOpt);

    Option isXVariantOpt = new Option("xv", "xvariant", false,
            "enable flag, if variant calling should be done independently for INDELs and SNPs");
    options.addOption(isXVariantOpt);

    try {
        // parse the command line arguments
        String[] args = new GenericOptionsParser(conf, options, argv).getRemainingArgs();
        CommandLine line = parser.parse(options, args);

        if (line.hasOption(noAlignOpt.getOpt()))
            noalign = true;
        if (line.hasOption(noReAlignOpt.getOpt()))
            norealign = true;
        if (line.hasOption(noMarkDupOpt.getOpt()))
            nomarkdup = true;
        if (line.hasOption(noQRecabOpt.getOpt()))
            noqrecab = true;
        if (line.hasOption(noVarOpt.getOpt()))
            novariant = true;
        if (line.hasOption(noFVarOpt.getOpt()))
            nofvariant = true;
        if (line.hasOption(noMerOpt.getOpt()))
            nomresults = true;

        if (line.hasOption(fq1Opt.getOpt()) && line.hasOption(bamOpt.getOpt())) {
            throw new ParseException(
                    "Invalid Usage: fastq file and BAM file cannot be given together as input");
        }
        if (line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt())) {
            throw new ParseException("Invalid Usage: fastq file2 is invalid without fastq file1");
        }
        if (!line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt())
                && !line.hasOption(bamOpt.getOpt())) {
            throw new ParseException(
                    "Invalid Usage: Either the fastq file or BAM file has to be provided as input");
        }
        if (line.hasOption(gatkdLocOpt.getOpt())) {
            gatk_binary_loc = line.getOptionValue(gatkdLocOpt.getOpt());
            validatePath(gatk_binary_loc, conf);
        } else {
            throw new ParseException(
                    "Invalid Usage: GATK dependency jar location (-djarloc) is mandatory for running the pipeline");
        }

        if (!noalign) {
            if (line.hasOption(fq1Opt.getOpt())) {
                readFile1 = line.getOptionValue(fq1Opt.getOpt());
                validatePath(readFile1, conf);
                fqInput = (new Path(readFile1).getParent()).toString();
            }
            if (line.hasOption(fq2Opt.getOpt())) {
                readFile2 = line.getOptionValue(fq2Opt.getOpt());
                conf.setBoolean("gatk.hadoop.pairedend", true);
                validatePath(readFile2, conf);
                conf.set("gatk.hadoop.readfile2", readFile2);
            }
            if (line.hasOption(rSizeOpt.getOpt())) {
                fq_read_size = Integer.parseInt(line.getOptionValue(rSizeOpt.getOpt()));
            } else {
                throw new ParseException("Invalid Usage: read size (-rsize) is mandatory for Alignment");
            }
            if (line.hasOption(bwaLocOpt.getOpt())) {
                bwa_binary_loc = line.getOptionValue(bwaLocOpt.getOpt());
                validatePath(bwa_binary_loc, conf);
            } else {
                throw new ParseException(
                        "Invalid Usage: bwa binary/exe location (-bwaloc) is mandatory for Alignment");
            }
            if (line.hasOption(rPSplitOpt.getOpt())) {
                reads_per_split = Integer.parseInt(line.getOptionValue(rPSplitOpt.getOpt()));
            }
        }
        if (line.hasOption(nRedOpt.getOpt())) {
            nReducers = Integer.parseInt(line.getOptionValue(nRedOpt.getOpt()));
        }
        if (line.hasOption(nThreadOpt.getOpt())) {
            nThreads = Integer.parseInt(line.getOptionValue(nThreadOpt.getOpt()));
            conf.setInt("gatk.hadoop.nthreads", nThreads);
        }
        if (line.hasOption(bamOpt.getOpt())) {
            int rcount = 0;
            BAMInputPath = line.getOptionValue(bamOpt.getOpt());
            validatePath(BAMInputPath, conf);
            Path BAMPath = new Path(BAMInputPath);
            FileSystem fs = BAMPath.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(BAMPath);
            for (int i = 0; i < content.length; i++) {
                String filename = content[i].getPath().getName();
                if (filename.endsWith(".bam")) {
                    String prefix = filename.substring(0, 6);
                    try {
                        // A successful parse means the file already carries a
                        // numeric prefix, so no rename is needed.
                        Long.valueOf(prefix);
                    } catch (NumberFormatException e) {
                        String tmpFile = BAMInputPath + Path.SEPARATOR + String.format("%06d", rcount) + "-"
                                + filename;
                        boolean rename = fs.rename(content[i].getPath(), new Path(tmpFile));
                    }
                    rcount++;
                }
            }
        }
        if (line.hasOption(outOpt.getOpt())) {
            outputDir = line.getOptionValue(outOpt.getOpt());
            if (!(new Path(outputDir).getFileSystem(conf).mkdirs(new Path(outputDir)))) {
                throw new Exception("MKDIR failure");
            }
            if (!noalign) {
                BWAOutPath = outputDir + Path.SEPARATOR + "AlignerOut";
                SortBWAOutPath = outputDir + Path.SEPARATOR + "SortedAlignerOut";
                BAMInputPath = outputDir + Path.SEPARATOR + "BAMInput";
            }
            IndelOutPath = outputDir + Path.SEPARATOR + "IndelRealignOut";
            RmdupOutPath = outputDir + Path.SEPARATOR + "DedupOut";
            RecalOutPath = outputDir + Path.SEPARATOR + "RecalibrationOut";
            FinalBAMPath = outputDir + Path.SEPARATOR + "FinalBAMOut";
        } else {
            throw new ParseException("Invalid Usage: output directory is mandatory");
        }
        if (line.hasOption(refFileOpt.getOpt())) {
            Path refFileDir = new Path(line.getOptionValue(refFileOpt.getOpt()));
            FileSystem fs = refFileDir.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(refFileDir);
            for (int i = 0; i < content.length; i++) {
                if ((content[i].getPath().getName()).endsWith(".fa")
                        || (content[i].getPath().getName()).endsWith(".fasta")) {
                    refFileLoc = content[i].getPath().toString();
                }
            }
            validatePath(refFileLoc, conf);
            refFileName = refFileLoc.substring(0, refFileLoc.lastIndexOf("."));
        } else {
            throw new ParseException("Invalid Usage: reference fasta file is mandatory");
        }
        if (line.hasOption(kSiteFileOpt.getOpt())) {
            Path knownSitesDir = new Path(line.getOptionValue(kSiteFileOpt.getOpt()));
            FileSystem fs = knownSitesDir.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(knownSitesDir);
            for (int i = 0; i < content.length; i++) {
                if ((content[i].getPath().getName()).endsWith(".vcf")) {
                    knownSitesLoc = content[i].getPath().toString();
                }
            }
            validatePath(knownSitesLoc, conf);
        }
        if (line.hasOption(platformOpt.getOpt())) {
            platform = line.getOptionValue(platformOpt.getOpt());
            if (platform.equalsIgnoreCase("Linux")) {
                is_azure = false;
                conf.setBoolean("gatk.hadoop.isazure", false);
            }
        }
        if (line.hasOption(isXVariantOpt.getOpt())) {
            xVariantCall = true;
        }
    } catch (ParseException exp) {
        System.out.println(exp.getMessage());
        if (printUsage) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("hadoop jar {/local/path/to/SeqInCloud.jar} {options}", options);
        }
        System.exit(-1);
    } catch (Exception exp) {
        System.out.println("Command line parsing error: " + exp.getMessage());
        System.exit(-1);
    }
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime,
                splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);

        parseCommandLineArgs(argv, conf);

        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();

        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();
        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();
            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = blockSize;

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);

                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;

                if (numSplits < nReducers)
                    numSplits = nReducers;

                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));

            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"),
                        job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache

            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }

            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1024L * 1024 * 1024));
            }

            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));

            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));
            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());

                    break;
                }
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            //conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);

            job.setInputFormatClass(BAMInputFormat.class);

            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);

            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }

            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /* 
             * MarkDuplicate and Indexing Job 
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }

            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);

                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);

            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);

            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }

                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }

            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);

            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }

            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }
        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();

                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);

                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));

                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());

                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");

            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);

            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Merging MarkDuplicates output from the job client is not yet
            // supported; it would need a separate MR job.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);

            fs = inputPath.getFileSystem(conf);

            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            // Remember the last entry and count how many files the directory holds.
            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }

            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                out = outFs.create(mergeOutFile, true);

                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }

                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }

            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}

From source file:org.commoncrawl.mapred.ec2.parser.OutputCommitter.java

License:Open Source License

Path getTempTaskOutputPath(TaskAttemptContext taskContext) {
    JobConf conf = taskContext.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        Path p = new Path(outputPath, (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR + "_"
                + taskContext.getTaskAttemptID().toString()));
        try {
            FileSystem fs = p.getFileSystem(conf);
            return p.makeQualified(fs);
        } catch (IOException ie) {
            LOG.warn(StringUtils.stringifyException(ie));
            return p;
        }
    }
    return null;
}