Usage examples for org.apache.hadoop.fs.Path.SEPARATOR, collected from open-source projects. Each example lists its source file and license.
Field: String SEPARATOR
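Path.SEPARATOR is the forward-slash separator string ("/") that Hadoop places between path components, independent of the local operating system's separator. Before the project examples below, here is a minimal, self-contained sketch of the common string-concatenation idiom; the directory and file names are made up for illustration:

import org.apache.hadoop.fs.Path;

public class PathSeparatorExample {
    public static void main(String[] args) {
        // Path.SEPARATOR is "/" on every platform; Path.SEPARATOR_CHAR is the char form.
        String relative = "output" + Path.SEPARATOR + "part-00000";   // "output/part-00000"
        Path child = new Path("/user/alice", relative);               // illustrative parent dir
        System.out.println(child);                                    // /user/alice/output/part-00000
    }
}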
From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java
License:Apache License
/**
 * Return the path to local map output file created earlier
 *
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFile() throws IOException {
    return lDirAlloc.getLocalPathToRead(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, conf);
}
From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java
License:Apache License
/**
 * Create a local map output file name.
 *
 * @param size the size of the file
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFileForWrite(long size) throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, size, conf);
}
From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java
License:Apache License
/**
 * Create a local map output file name. This should *only* be used if the size
 * of the file is not known. Otherwise use the equivalent which accepts a size
 * parameter.
 *
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputFileForWrite() throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING, conf);
}
From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java
License:Apache License
/**
 * Return the path to a local map output index file created earlier
 *
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputIndexFile() throws IOException {
    return lDirAlloc.getLocalPathToRead(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING, conf);
}
From source file:org.apache.tez.runtime.library.common.task.local.output.TezLocalTaskOutputFiles.java
License:Apache License
/**
 * Create a local map output index file name.
 *
 * @param size the size of the file
 * @return path
 * @throws IOException
 */
@Override
public Path getOutputIndexFileForWrite(long size) throws IOException {
    return lDirAlloc.getLocalPathForWrite(Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING
            + Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING, size, conf);
}
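All five methods above build the same relative name from the Constants class with Path.SEPARATOR and hand it to a LocalDirAllocator, which resolves it against one of the configured local directories. A condensed sketch of that composition, assuming the allocator (lDirAlloc) and Configuration (conf) that TezLocalTaskOutputFiles already holds, and an illustrative size hint:

// Sketch only: lDirAlloc and conf are assumed to come from the surrounding class.
String attemptOutput = Constants.TEZ_RUNTIME_TASK_OUTPUT_DIR + Path.SEPARATOR
        + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING;
String attemptIndex = attemptOutput + Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING;

long size = 64L * 1024 * 1024;                                            // illustrative size hint
Path readable = lDirAlloc.getLocalPathToRead(attemptOutput, conf);        // path must already exist
Path writable = lDirAlloc.getLocalPathForWrite(attemptIndex, size, conf); // reserves space for the write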
From source file:org.apache.tez.test.MiniTezCluster.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezCluster.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezCluster.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezCluster.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        // mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java
License:Open Source License
static void parseCommandLineArgs(String[] argv, Configuration conf) { CommandLineParser parser = new PosixParser(); Options options = new Options(); Option gatkdLocOpt = OptionBuilder.withArgName("depjar_loc").hasArg() .withDescription("Complete HDFS path of gatk dependency jar").create("djarloc"); options.addOption(gatkdLocOpt);// w ww . j a v a2 s. c o m Option bwaLocOpt = OptionBuilder.withArgName("bwa_loc").hasArg() .withDescription("Complete HDFS path of bwa binary or bwa.exe file").create("bwaloc"); options.addOption(bwaLocOpt); Option fq1Opt = OptionBuilder.withArgName("fastq_file1").hasArg() .withDescription("Complete HDFS path or path relative to user directory for 1st fastq file") .create("r1"); options.addOption(fq1Opt); Option fq2Opt = OptionBuilder.withArgName("fastq_file2").hasArg() .withDescription("Complete HDFS path or path relative to user directory for 2nd fastq file") .create("r2"); options.addOption(fq2Opt); Option bamOpt = OptionBuilder.withArgName("bam_directory").hasArg() .withDescription( "Complete HDFS directory path or path relative to user directory for input BAM file") .create("b"); options.addOption(bamOpt); Option outOpt = OptionBuilder.withArgName("output_directory").hasArg() .withDescription("Complete HDFS path or path relative to user directory for output directory") .create("o"); options.addOption(outOpt); Option rSizeOpt = OptionBuilder.withArgName("fastq_read_size").hasArg() .withDescription("Number of bytes of a read sequence in input FastQ file").create("rsize"); options.addOption(rSizeOpt); Option rPSplitOpt = OptionBuilder.withArgName("reads_per_map_split").hasArg() .withDescription("Optional number of reads to be processed by a mapper").create("reads_per_split"); options.addOption(rPSplitOpt); Option nRedOpt = OptionBuilder.withArgName("number_of_reducers").hasArg() .withDescription("Optional number of reducers").create("nred"); options.addOption(nRedOpt); Option nThreadOpt = OptionBuilder.withArgName("number_of_threads").hasArg() .withDescription("Optional number of threads").create("nthreads"); options.addOption(nThreadOpt); Option refFileOpt = OptionBuilder.withArgName("path_to_reference_dir").hasArg() .withDescription("Complete HDFS path of reference directory").create("ref"); options.addOption(refFileOpt); Option kSiteFileOpt = OptionBuilder.withArgName("path_to_knownsites_dir").hasArg() .withDescription("Complete HDFS path of known-sites db directory").create("dbfile"); options.addOption(kSiteFileOpt); Option platformOpt = OptionBuilder.withArgName("Linux/Windows").hasArg() .withDescription("Platform to run on").create("p"); options.addOption(platformOpt); Option noAlignOpt = new Option("na", "noalign", false, "Don't run Alignment stage"); options.addOption(noAlignOpt); Option noReAlignOpt = new Option("nra", "norealign", false, "Do not run Local Realignment stage"); options.addOption(noReAlignOpt); Option noMarkDupOpt = new Option("nmd", "nomarkdup", false, "Do not run Mark Duplicates stage"); options.addOption(noMarkDupOpt); Option noQRecabOpt = new Option("nqr", "noqrecab", false, "Do not run Quality Recalibration stage"); options.addOption(noQRecabOpt); Option noVarOpt = new Option("nv", "novariant", false, "Do not run Structural Variant stage"); options.addOption(noVarOpt); Option noFVarOpt = new Option("nfv", "nofvariant", false, "Do not run Filter Variant stage"); options.addOption(noFVarOpt); Option noMerOpt = new Option("nm", "nomresults", false, "Do not Merge Results"); options.addOption(noMerOpt); Option isXVariantOpt = new 
Option("xv", "xvariant", false, "enable flag, if variant calling should be done independently for INDELs and SNPs"); options.addOption(isXVariantOpt); try { // parse the command line arguments String[] args = new GenericOptionsParser(conf, options, argv).getRemainingArgs(); CommandLine line = parser.parse(options, args); if (line.hasOption(noAlignOpt.getOpt())) noalign = true; if (line.hasOption(noReAlignOpt.getOpt())) norealign = true; if (line.hasOption(noMarkDupOpt.getOpt())) nomarkdup = true; if (line.hasOption(noQRecabOpt.getOpt())) noqrecab = true; if (line.hasOption(noVarOpt.getOpt())) novariant = true; if (line.hasOption(noFVarOpt.getOpt())) nofvariant = true; if (line.hasOption(noMerOpt.getOpt())) nomresults = true; if (line.hasOption(fq1Opt.getOpt()) && line.hasOption(bamOpt.getOpt())) { throw new ParseException( "Invalid Usage: fastq file and BAM file cannot be given together as input"); } if (line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt())) { throw new ParseException("Invalid Usage: fastq file2 is invalid without fastq file1"); } if (!line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt()) && !line.hasOption(bamOpt.getOpt())) { throw new ParseException( "Invalid Usage: Either the fastq file or BAM file has to be provided as input"); } if (line.hasOption(gatkdLocOpt.getOpt())) { gatk_binary_loc = line.getOptionValue(gatkdLocOpt.getOpt()); validatePath(gatk_binary_loc, conf); } else { throw new ParseException( "Invalid Usage: GATK dependency jar location (-djarloc) is mandatory for running the pipeline"); } if (!noalign) { if (line.hasOption(fq1Opt.getOpt())) { readFile1 = line.getOptionValue(fq1Opt.getOpt()); validatePath(readFile1, conf); fqInput = (new Path(readFile1).getParent()).toString(); } if (line.hasOption(fq2Opt.getOpt())) { readFile2 = line.getOptionValue(fq2Opt.getOpt()); conf.setBoolean("gatk.hadoop.pairedend", true); validatePath(readFile2, conf); conf.set("gatk.hadoop.readfile2", readFile2); ; } if (line.hasOption(rSizeOpt.getOpt())) { fq_read_size = Integer.parseInt(line.getOptionValue(rSizeOpt.getOpt())); } else { throw new ParseException("Invalid Usage: read size (-rsize) is mandatory for Alignment"); } if (line.hasOption(bwaLocOpt.getOpt())) { bwa_binary_loc = line.getOptionValue(bwaLocOpt.getOpt()); validatePath(bwa_binary_loc, conf); } else { throw new ParseException( "Invalid Usage: bwa binary/exe location (-bwaloc) is mandatory for Alignment"); } if (line.hasOption(rPSplitOpt.getOpt())) { reads_per_split = Integer.parseInt(line.getOptionValue(rPSplitOpt.getOpt())); } } if (line.hasOption(nRedOpt.getOpt())) { nReducers = Integer.parseInt(line.getOptionValue(nRedOpt.getOpt())); } if (line.hasOption(nThreadOpt.getOpt())) { nThreads = Integer.parseInt(line.getOptionValue(nThreadOpt.getOpt())); conf.setInt("gatk.hadoop.nthreads", nThreads); } if (line.hasOption(bamOpt.getOpt())) { int rcount = 0; BAMInputPath = line.getOptionValue(bamOpt.getOpt()); validatePath(BAMInputPath, conf); Path BAMPath = new Path(BAMInputPath); FileSystem fs = BAMPath.getFileSystem(conf); FileStatus[] content = fs.listStatus(BAMPath); for (int i = 0; i < content.length; i++) { String filename = content[i].getPath().getName(); if (filename.endsWith(".bam")) { String prefix = filename.substring(0, 6); try { Long value = Long.valueOf(prefix); } catch (NumberFormatException e) { String tmpFile = BAMInputPath + Path.SEPARATOR + String.format("%06d", rcount) + "-" + filename; boolean rename = fs.rename(content[i].getPath(), new Path(tmpFile)); } rcount++; } } 
} if (line.hasOption(outOpt.getOpt())) { outputDir = line.getOptionValue(outOpt.getOpt()); if (!(new Path(outputDir).getFileSystem(conf).mkdirs(new Path(outputDir)))) { throw new Exception("MKDIR failure"); } if (!noalign) { BWAOutPath = outputDir + Path.SEPARATOR + "AlignerOut"; SortBWAOutPath = outputDir + Path.SEPARATOR + "SortedAlignerOut"; BAMInputPath = outputDir + Path.SEPARATOR + "BAMInput"; } IndelOutPath = outputDir + Path.SEPARATOR + "IndelRealignOut"; RmdupOutPath = outputDir + Path.SEPARATOR + "DedupOut"; RecalOutPath = outputDir + Path.SEPARATOR + "RecalibrationOut"; FinalBAMPath = outputDir + Path.SEPARATOR + "FinalBAMOut"; } else { throw new ParseException("Invalid Usage: output directory is mandatory"); } if (line.hasOption(refFileOpt.getOpt())) { Path refFileDir = new Path(line.getOptionValue(refFileOpt.getOpt())); FileSystem fs = refFileDir.getFileSystem(conf); FileStatus[] content = fs.listStatus(refFileDir); for (int i = 0; i < content.length; i++) { if ((content[i].getPath().getName()).endsWith(".fa") || (content[i].getPath().getName()).endsWith(".fasta")) { refFileLoc = content[i].getPath().toString(); } } validatePath(refFileLoc, conf); refFileName = refFileLoc.substring(0, refFileLoc.lastIndexOf(".")); } else { throw new ParseException("Invalid Usage: reference fasta file is mandatory"); } if (line.hasOption(kSiteFileOpt.getOpt())) { Path knownSitesDir = new Path(line.getOptionValue(kSiteFileOpt.getOpt())); FileSystem fs = knownSitesDir.getFileSystem(conf); FileStatus[] content = fs.listStatus(knownSitesDir); for (int i = 0; i < content.length; i++) { if ((content[i].getPath().getName()).endsWith(".vcf")) { knownSitesLoc = content[i].getPath().toString(); } } validatePath(knownSitesLoc, conf); } if (line.hasOption(platformOpt.getOpt())) { platform = line.getOptionValue(platformOpt.getOpt()); if (platform.equalsIgnoreCase("Linux")) { is_azure = false; conf.setBoolean("gatk.hadoop.isazure", false); } } if (line.hasOption(isXVariantOpt.getOpt())) { xVariantCall = true; } } catch (ParseException exp) { System.out.println(exp.getMessage()); if (printUsage) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("hadoop jar {/local/path/to/SeqInCloud.jar} {options}", options); } System.exit(-1); } catch (Exception exp) { System.out.println("Command line parsing error: " + exp.getMessage()); System.exit(-1); } }
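parseCommandLineArgs uses Path.SEPARATOR in two ways: to lay out the per-stage output directories under the chosen output directory, and to build renamed BAM paths with a zero-padded numeric prefix. A reduced sketch of those two patterns, where the variable values and the "sample.bam" file name are placeholders:

// Sketch only: outputDir and bamInputPath are HDFS path strings validated elsewhere.
String outputDir = "/user/alice/gatk-out";                       // placeholder
String alignerOut = outputDir + Path.SEPARATOR + "AlignerOut";
String dedupOut = outputDir + Path.SEPARATOR + "DedupOut";

// Prefix an input BAM name with a six-digit sequence number, as the original code
// does when the existing name does not already start with one.
String bamInputPath = "/user/alice/bam-input";                    // placeholder
int rcount = 0;
String renamed = bamInputPath + Path.SEPARATOR + String.format("%06d", rcount) + "-" + "sample.bam";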
From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java
License:Open Source License
@Override public int run(String[] argv) throws Exception { try {/*from ww w . j a va2 s. co m*/ Configuration conf; FileSystem srcFs, outFs, fs; Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath; int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100; FileStatus[] content; ClusterStatus status; int numNodes, mapSlotsPerNode; long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime, splitSize; float inputBufpcnt; FSDataOutputStream out; FSDataInputStream in; SAMFileReader fileReader; InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler; double sampling_frequency = 0.01; // Job object can be used for Aligner job if enabled conf = getConf(); Job job = new Job(conf); parseCommandLineArgs(argv, conf); maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks(); maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks(); if (!noalign) { System.out.println("Starting Alignment Job"); startTime = System.currentTimeMillis(); status = new JobClient(new JobConf(conf)).getClusterStatus(); numNodes = status.getTaskTrackers(); // Job specific setting of number of Reducers.. if (nReducers == 0) nReducers = numNodes; conf.setInt("mapred.reduce.tasks", nReducers); Path refPath = new Path(refFileLoc); fs = refPath.getFileSystem(conf); blockSize = fs.getFileStatus(refPath).getBlockSize(); splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize()); if (reads_per_split == 0) { inputPath = new Path(readFile1); long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen(); long numSplits = Math.round(readSize / splitSize); if (numSplits < maxMapTasks) numSplits = maxMapTasks; if (numSplits < nReducers) numSplits = nReducers; long numReads = Math.round(readSize / (long) fq_read_size); reads_per_split = numReads / numSplits; // Total Order Partitioner if ((double) reads_per_split <= (1 / sampling_frequency)) { sampling_frequency = 1; granularity = 1; } else if (((double) reads_per_split > (1 / sampling_frequency)) && ((double) reads_per_split <= (1 / sampling_frequency * 100))) { sampling_frequency = 0.1; granularity = 10; } } job.setJarByClass(GATKJobClient.class); job.setInputFormatClass(NLineXInputFormat.class); FileInputFormat.addInputPath(job, new Path(fqInput)); FileOutputFormat.setOutputPath(job, new Path(BWAOutPath)); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration()); if (!is_azure) { DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"), job.getConfiguration()); } else { DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"), job.getConfiguration()); } 
DistributedCache.createSymlink(job.getConfiguration()); // Setting local.cache.size - Add up the size of the files // distributed through the cache cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen() + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen(); if (!is_azure) { cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen(); } if (cacheSize > 8 * 1024 * 1024 * 1024) { conf.setLong("local.cache.size", cacheSize + (1 * 1024 * 1024 * 1024)); } conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs.. conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); // conf.setBoolean("mapred.map.tasks.speculative.execution", false); // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(BWAPartitioner.class); job.setReducerClass(BWAReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); if (job.waitForCompletion(true)) { System.out.println("BWA Alignment done"); } content = fs.listStatus(new Path(BWAOutPath)); for (int i = 0; i < content.length; i++) { if (!((content[i].getPath().getName()).endsWith(".bam")) && !((content[i].getPath().getName()).startsWith("_"))) { fs.delete(content[i].getPath(), false); } } endTime = System.currentTimeMillis(); System.out.println("BWA Alignment took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Splitting BAM Indexing Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BWAOutPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1"); FileOutputFormat.setOutputPath(job, output); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); conf.setInt("gatk.hadoop.granularity", granularity); conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.isindex", false); conf.setBoolean("gatk.hadoop.ismarkdup", false); job.setMapperClass(IndexMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("SplittingBAM Indexing job done"); } output.getFileSystem(conf).delete(output, true); endTime = System.currentTimeMillis(); System.out.println("Splitting BAM Indexing took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Sort Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (norealign && nomarkdup && noqrecab && novariant && !nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); inputPath = new Path(BWAOutPath); 
FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath)); job.setInputFormatClass(ContigInputFormat.class); job.setPartitionerClass(ContigPartitioner.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); fs = inputPath.getFileSystem(conf); content = fs.listStatus(inputPath); for (int i = 0; i < content.length; i++) { if (content[i].getPath().getName().endsWith(".bam")) { in = fs.open(content[i].getPath()); List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader()) .getSequenceDictionary().getSequences(); conf.setInt("mapred.reduce.tasks", sequences.size()); break; } } conf.setLong("mapred.task.timeout", 86400000L); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); //conf.setBoolean("mapred.map.tasks.speculative.execution", false); //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); if (job.waitForCompletion(true)) { System.out.println("Sort completed successfully"); } endTime = System.currentTimeMillis(); System.out.println("Sort job took: " + (endTime - startTime)); } if (!norealign) { if (!noalign) BAMInputPath = SortBWAOutPath; startTime = System.currentTimeMillis(); System.out.println("Starting Indexing Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2"); FileOutputFormat.setOutputPath(job, output); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); conf.setBoolean("gatk.hadoop.isindex", true); conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.ismarkdup", false); job.setMapperClass(IndexMapper.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Indexing job done"); } output.getFileSystem(conf).delete(output, true); endTime = System.currentTimeMillis(); System.out.println("Indexing job took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Realigner Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(BAMInputFormat.class); srcFs = new Path(outputDir).getFileSystem(conf); if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition"))) System.out.println("mkdir failed"); inputDir = new Path(outputDir + Path.SEPARATOR + "Partition"); inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf)); partition = new Path(inputDir, "_partition"); job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partition); try { URI partitionURI 
= new URI(partition.toString() + "#_partition"); DistributedCache.addCacheFile(partitionURI, conf); } catch (URISyntaxException e) { assert false; } if (nReducers == 0) { if (!nomarkdup || !noqrecab || !novariant) { conf.setInt("mapred.reduce.tasks", maxMapTasks); } else { conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10)); } } else { conf.setInt("mapred.reduce.tasks", nReducers); } conf.setLong("mapred.task.timeout", 86400000L); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 if (nomarkdup && noqrecab && novariant && !nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setMapperClass(IndelMapper.class); job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(IndelOutPath)); sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency, max_splits); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler); job.setInputFormatClass(LociInputFormat.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Indel realignment done"); } endTime = System.currentTimeMillis(); System.out.println("Indel Realigner took: " + (endTime - startTime)); } if (!nomarkdup || !noqrecab || !novariant) { /* * MarkDuplicate and Indexing Job * FixMateInformation is not required as it is handled * automatically by GATK after IndelRealignment. 
*/ System.out.println("Starting MarkDup/Indexing job"); startTime = System.currentTimeMillis(); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); if (!nomarkdup) { System.out.println("Starting MarkDuplicates job"); conf.setBoolean("gatk.hadoop.ismarkdup", true); FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath)); } if (!noqrecab || !novariant) { conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.isindex", true); if (nomarkdup) { System.out.println("Starting Indexing job"); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3")); } } job.setMapperClass(IndexMapper.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Markdup/Indexing job done !!!"); } Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"); fs = toDelete.getFileSystem(conf); if (fs.exists(toDelete)) { fs.delete(toDelete, true); } if (!nomarkdup) { Path rmdupOutPath = new Path(RmdupOutPath); fs = rmdupOutPath.getFileSystem(conf); content = fs.listStatus(rmdupOutPath); for (int i = 0; i < content.length; i++) { if ((content[i].getPath().getName()).startsWith("part")) { fs.delete(content[i].getPath(), false); } } endTime = System.currentTimeMillis(); System.out.println("MarkDuplicates took: " + (endTime - startTime)); } else { endTime = System.currentTimeMillis(); System.out.println("Indexing took: " + (endTime - startTime)); } } if (!noqrecab) { startTime = System.currentTimeMillis(); System.out.println("Starting Recal - Count Covariates Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (!nomarkdup) inputPath = new Path(RmdupOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(LociInputFormat.class); conf.setLong("local.cache.size", 20106127360L); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.set("gatk.hadoop.outputpath", outputDir); // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1); // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1); // conf.setBoolean("mapred.map.tasks.speculative.execution", false); // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1 conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setMapperClass(RecalCovMapper.class); job.setCombinerClass(RecalCovCombiner.class); job.setReducerClass(RecalCovReducer.class); job.setMapOutputKeyClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + 
"CovariateOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("CountCovariates done"); } endTime = System.currentTimeMillis(); System.out.println("CountCovariates took: " + (endTime - startTime)); } if (!noqrecab || !novariant) { startTime = System.currentTimeMillis(); System.out.println("Starting Table Recalibration / Unified Genotyper Job"); if (!nomarkdup) inputPath = new Path(RmdupOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); FileInputFormat.addInputPath(job, inputPath); if (!noqrecab) { conf.setBoolean("gatk.hadoop.recab", true); if (norealign) { job.setInputFormatClass(BAMInputFormat.class); srcFs = new Path(outputDir).getFileSystem(conf); if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition"))) System.out.println("mkdir failed"); } else { job.setInputFormatClass(LociInputFormat.class); } inputDir = new Path(outputDir + "/" + "Partition"); inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf)); partition = new Path(inputDir, "_partition"); job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partition); try { URI partitionURI = new URI(partition.toString() + "#_partition"); DistributedCache.addCacheFile(partitionURI, conf); } catch (URISyntaxException e) { assert false; } if (nReducers == 0) { conf.setInt("mapred.reduce.tasks", maxMapTasks); } else { conf.setInt("mapred.reduce.tasks", nReducers); } conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); if (!nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(RecalOutPath)); } else { job.setInputFormatClass(LociInputFormat.class); conf.setInt("mapred.reduce.tasks", 0); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4")); } job.setMapperClass(RecalMapper.class); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); conf.set("gatk.hadoop.outputpath", outputDir); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); if (!novariant) { conf.setBoolean("gatk.hadoop.variant", true); if (!nofvariant) conf.setBoolean("gatk.hadoop.fvariant", true); conf.setInt("gatk.hadoop.nthreads", nThreads); 
conf.setBoolean("gatk.hadoop.xvariant", xVariantCall); } if (!noqrecab && norealign) { sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency, max_splits); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler); job.setInputFormatClass(LociInputFormat.class); } DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("TableRecalibration Job done !!"); } endTime = System.currentTimeMillis(); Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"); fs = toDelete.getFileSystem(conf); if (fs.exists(toDelete)) { fs.delete(toDelete, true); } System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime)); } if (!novariant && !nomresults) { startTime = System.currentTimeMillis(); System.out.println("Merge Variant Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut"); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setReducerClass(VariantReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Merge Variants done"); } endTime = System.currentTimeMillis(); System.out.println("MergeVariant job took: " + (endTime - startTime)); if (xVariantCall && !novariant && !nomresults) { startTime = System.currentTimeMillis(); System.out.println("Merge INDEL Variant Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut"); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); 
job.setReducerClass(VariantReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Merge INDEL Variants done"); } endTime = System.currentTimeMillis(); System.out.println("MergeINDELVariant job took: " + (endTime - startTime)); } } if (!nomresults) { startTime = System.currentTimeMillis(); System.out.println("Starting Merge BAM Job"); outputPath = new Path(FinalBAMPath); outFs = outputPath.getFileSystem(conf); if (!outFs.mkdirs(outputPath)) System.out.println("mkdir failed"); // Currently no support to merge output from MarkDuplicates // from Job Client. Need to have a separate MR job for it. if (!noqrecab) inputPath = new Path(RecalOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else if (!nomarkdup) throw new Exception("Merge not implemented MarkDuplicates output."); else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant) inputPath = new Path(BAMInputPath); fs = inputPath.getFileSystem(conf); content = fs.listStatus(inputPath); mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam"); Path p = null; int nfiles = 0; for (int i = 0; i < content.length; i++) { p = content[i].getPath(); ++nfiles; } if (nfiles == 1) { boolean rename = fs.rename(p, mergeOutFile); } else { out = outFs.create(mergeOutFile, true); for (int i = 0; i < content.length; i++) { p = content[i].getPath(); if ((p.getName()).endsWith(".bam")) { in = fs.open(p); IOUtils.copyBytes(in, out, conf, false); in.close(); } } out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); out.close(); } endTime = System.currentTimeMillis(); System.out.println("Final Merge took: " + (endTime - startTime)); } System.out.println("JobCompleted"); } catch (IOException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (InterruptedException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (ClassNotFoundException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (Exception e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } return 0; }
From source file:org.commoncrawl.mapred.ec2.parser.OutputCommitter.java
License:Open Source License
Path getTempTaskOutputPath(TaskAttemptContext taskContext) {
    JobConf conf = taskContext.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        Path p = new Path(outputPath,
                (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR + "_"
                        + taskContext.getTaskAttemptID().toString()));
        try {
            FileSystem fs = p.getFileSystem(conf);
            return p.makeQualified(fs);
        } catch (IOException ie) {
            LOG.warn(StringUtils.stringifyException(ie));
            return p;
        }
    }
    return null;
}
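The committer builds the per-attempt temporary directory by joining FileOutputCommitter.TEMP_DIR_NAME, Path.SEPARATOR, and the attempt ID under the job output path. The same location can also be reached by chaining Path constructors instead of concatenating strings; a sketch of the equivalence, with the output directory and attempt ID as made-up illustrations:

// Sketch only: the paths and attempt ID below are illustrative, not taken from a real job.
Path outputPath = new Path("/user/alice/job-output");
String attemptId = "attempt_200707121733_0003_m_000005_0";
Path viaString = new Path(outputPath,
        FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR + "_" + attemptId);
Path viaCtor = new Path(new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME), "_" + attemptId);
// Both resolve to <output>/_temporary/_attempt_... since TEMP_DIR_NAME is "_temporary".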