List of usage examples for org.apache.hadoop.io.IOUtils.copyBytes
public static void copyBytes(InputStream in, OutputStream out, long count, boolean close) throws IOException
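Note that most of the examples below actually use the sibling overloads copyBytes(InputStream, OutputStream, int buffSize, boolean close) and copyBytes(InputStream, OutputStream, Configuration conf, boolean close); the conf variant reads its buffer size from io.file.buffer.size. As a minimal, self-contained sketch of the count-taking overload shown above (paths are illustrative, and the default filesystem configuration is assumed):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CopyBytesCountExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Copy only the first 1 MiB (or until EOF, whichever comes first).
        // close=false leaves both streams open; here try-with-resources
        // closes them instead.
        try (FSDataInputStream in = fs.open(new Path("/tmp/input.bin"));
             FSDataOutputStream out = fs.create(new Path("/tmp/head.bin"))) {
            IOUtils.copyBytes(in, out, 1024L * 1024L, false);
        }
    }
}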
From source file: org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License: Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);
    // Check for an empty input directory before touching files[0]
    // (the original indexed files[0] first, which would throw).
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }
    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);
    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));
    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    // Write the VCF header to its own BGZF file. The FilterOutputStream
    // wrapper swallows close() so the VariantContextWriter cannot close
    // the codec stream; the stream is closed explicitly below.
    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    // Concatenate the header file and the sorted BGZF parts into the final
    // output, keeping the destination stream open across copies (close=false).
    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));

    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    // Terminate the BGZF stream with the empty block readers expect.
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();

    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}
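The concatenation at the end of this example is a pattern that recurs on this page: copy each BGZF-compressed piece with close=false so the destination stays open, then append the empty BGZF terminator block. A hedged distillation of that pattern, using the same Hadoop and htsjdk types as the example above (concatBgzfParts is our name, not the project's):

// Hypothetical helper: append a header file and a set of BGZF part files
// into one output stream, then write the 28-byte empty BGZF block that
// terminates the stream for downstream readers.
static void concatBgzfParts(FileSystem fs, Path header, FileStatus[] parts,
                            OutputStream out, Configuration conf) throws IOException {
    try (FSDataInputStream in = fs.open(header)) {
        IOUtils.copyBytes(in, out, conf, false); // keep `out` open for the parts
    }
    for (FileStatus part : parts) {
        try (FSDataInputStream in = fs.open(part.getPath())) {
            IOUtils.copyBytes(in, out, conf, false);
        }
    }
    out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    out.close();
}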
From source file: org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java
License: Open Source License
@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize,
                endTime, splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);
        parseCommandLineArgs(argv, conf);
        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();
        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();

        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();

            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize());

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);
                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;
                if (numSplits < nReducers)
                    numSplits = nReducers;
                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"), job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache
            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }
            // 8L avoids int overflow: 8 * 1024 * 1024 * 1024 does not fit in
            // an int (the original compared against an overflowed constant).
            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1L * 1024 * 1024 * 1024));
            }
            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));
            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");

            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);
            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);
            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");

            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());
                    break;
                }
            }
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);
            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(BAMInputFormat.class);
            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);
            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }
            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /*
             * MarkDuplicate and Indexing Job
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }

            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);
                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);
            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);
            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }
                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }
            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }
            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }

        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));
            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();
                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));
                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());
                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");
            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);
            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Currently no support to merge output from MarkDuplicates
            // from Job Client. Need to have a separate MR job for it.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented for MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);
            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }
            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                // Concatenate the BGZF part files and finish with the empty
                // terminator block, as in the other examples on this page.
                out = outFs.create(mergeOutFile, true);
                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }
                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }
            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}
From source file: org.deeplearning4j.utils.ShowData2UIServer.java
License: Apache License
public static void main(String[] args) throws Exception {
    // Load previously saved training stats and serve them in the browser
    // at http://localhost:9000
    File statsFile = null;
    if (hdfsPath) {
        statsFile = File.createTempFile("tmp", "dl4j");
        OutputStream os = new FileOutputStream(statsFile);
        FileSystem fs = CommonUtils.openHdfsConnect();
        InputStream in = fs
                .open(new Path("/user/hadoop/trainlog/AnimalModelByHdfsTrainingStatsSpark2.dl4j"));
        // close=false: close the streams explicitly below instead.
        IOUtils.copyBytes(in, os, 4096, false);
        IOUtils.closeStream(in);
        CommonUtils.closeHdfsConnect(fs);
        os.close();
    } else {
        statsFile = new File("/home/AnimalModelByHdfsTrainingStats1.dl4j");
    }
    StatsStorage statsStorage = new FileStatsStorage(statsFile);
    UIServer uiServer = UIServer.getInstance();
    uiServer.attach(statsStorage);
}
From source file: org.hadoop.tdg.TestPseudoHadoop.java
License: Apache License
public void copyFileWithProgress() throws IOException {
    InputStream in = null;
    FSDataOutputStream out = null;
    try {
        in = new BufferedInputStream(new FileInputStream(HOME_FILE));
        // FileSystem fs = FileSystem.get(URI.create(DST), conf);
        out = fs.create(p, new Progressable() {
            @Override
            public void progress() {
                System.out.print("~");
            }
        });
        IOUtils.copyBytes(in, out, 4096, true);
        // Assert.assertTrue(fs.getFileStatus(p).getLen() == );
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
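The example above passes close=true and then closes both streams again in the finally block; that is redundant but harmless, since copyBytes with close=true closes both streams itself, on success and on failure (closeStream ignores null and already-closed streams). A minimal sketch of the close=true idiom on its own, reusing the HOME_FILE, fs and p fixtures assumed by the test above:

// close = true: copyBytes closes `in` and `out` itself, even when the
// copy throws, so no finally block is needed.
InputStream in = new BufferedInputStream(new FileInputStream(HOME_FILE));
FSDataOutputStream out = fs.create(p);
IOUtils.copyBytes(in, out, 4096, true);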
From source file: org.hadoop.tdg.TestPseudoHadoop.java
License: Apache License
private void printStream(InputStream is) throws IOException {
    File f1 = new File(HOME_FILE);
    File f2 = new File(HOME + "/test.cpy");
    FileOutputStream fos = null;
    try {
        fos = new FileOutputStream(f2);
        IOUtils.copyBytes(is, fos, 4096, false);
        // Note: the comparison result is ignored here; wrap it in an
        // assertion to actually verify the copy.
        Files.equal(f1, f2);
    } finally {
        IOUtils.closeStream(is);
        IOUtils.closeStream(fos);
    }
}
From source file: org.hadoop.tdg.TestPseudoHadoop.java
License: Apache License
@Test
public void writeAndReadBzipCompressed() throws IOException {
    BZip2Codec codec = new BZip2Codec();
    String ext = codec.getDefaultExtension();
    Path p = new Path(DST_FILE + ext);
    File f1 = new File(HOME_FILE);
    File f2 = new File(HOME_FILE + ext);
    // Write compressed data to HDFS; close=true finishes the compression
    // stream so the file is complete (the original never closed it).
    CompressionOutputStream cout = codec.createOutputStream(fs.create(p));
    IOUtils.copyBytes(new FileInputStream(f1), cout, 4096, true);
    Assert.assertTrue(
            fs.getFileStatus(p).getPath().equals(new Path(fs.getUri().toString(), p.toUri().toString())));
    // Read back through the codec's decompressing stream and verify the
    // round trip. (The original copied the raw compressed bytes from `dis`
    // instead of reading through `cin`, which is why its comparison
    // "didn't work".)
    FSDataInputStream dis = fs.open(p);
    CompressionInputStream cin = codec.createInputStream(dis);
    IOUtils.copyBytes(cin, new FileOutputStream(f2), 4096, true);
    Assert.assertTrue(Files.equal(f1, f2));
}
From source file: org.mrgeo.data.raster.RasterWritable.java
License: Apache License
public static MrGeoRaster toMrGeoRaster(final RasterWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.bytes, 0, writable.getSize());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // close=true closes both the compression stream and the byte sink
    // once the 2 MiB-buffered copy finishes.
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);
    return toMrGeoRaster(new RasterWritable(baos.toByteArray()));
}
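This example and the VectorTileWritable one further down share the same decompress-into-memory shape. As a hedged, generic distillation (the helper name is ours, not MrGeo's; types are the Hadoop compression classes used above):

// Inflate a compressed byte range fully into memory. copyBytes with
// close=true closes both the CompressionInputStream and the
// ByteArrayOutputStream when the copy completes.
static byte[] decompress(byte[] bytes, int length, CompressionCodec codec,
                         Decompressor decompressor) throws IOException {
    decompressor.reset();
    ByteArrayInputStream bis = new ByteArrayInputStream(bytes, 0, length);
    CompressionInputStream cis = codec.createInputStream(bis, decompressor);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(cis, baos, 2 * 1024 * 1024, true);
    return baos.toByteArray();
}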
From source file: org.mrgeo.hdfs.utils.HadoopFileUtils.java
License: Apache License
@SuppressWarnings("squid:S2095") // hadoop FileSystem cannot be closed, or else subsequent uses will fail public static void copyFileToHdfs(final String fromFile, final String toFile, final boolean overwrite) throws IOException { final Path toPath = new Path(toFile); final Path fromPath = new Path(fromFile); final FileSystem srcFS = HadoopFileUtils.getFileSystem(toPath); final FileSystem dstFS = HadoopFileUtils.getFileSystem(fromPath); final Configuration conf = HadoopUtils.createConfiguration(); InputStream in = null;/*from w ww . j av a2s. c om*/ OutputStream out = null; try { in = srcFS.open(fromPath); out = dstFS.create(toPath, overwrite); IOUtils.copyBytes(in, out, conf, true); } catch (final IOException e) { IOUtils.closeStream(out); IOUtils.closeStream(in); throw e; } }
From source file: org.mrgeo.vector.mrsvector.VectorTileWritable.java
License: Apache License
public static VectorTile toMrsVector(final VectorTileWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.getBytes(), 0, writable.getLength());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);
    byte[] data = baos.toByteArray();
    return VectorTile.fromProtobuf(data, 0, data.length);
}
From source file: org.seqdoop.hadoop_bam.cli.plugins.Cat.java
License: Open Source License
@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));
    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.
    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.
    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.
    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));
        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.
    final OutputStream out;
    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.
    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);
    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.
    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.
                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Fill the buffer; EOF before it is full is fine. (Loop
                // rewritten from the original, whose exit condition could
                // spin once the buffer filled across several short reads.)
                int read = 0;
                while (read < block.capacity()) {
                    final int prev = in.read(block.array(), read, block.capacity() - read);
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat :: first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.
                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);
                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.
                IOUtils.readFully(bin, buf.array(), 0, 8);
                final int magic = buf.getInt(0), headerLen = buf.getInt(4);
                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);
                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.
                IOUtils.readFully(bin, buf.array(), 0, 4);
                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with
                    // the reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.
                final int remaining = bin.available();

                if (verbose)
                    System.err.printf("cat :: %d bytes to bgzip\n", remaining);

                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);
                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.
                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.
    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }
    return 0;
}
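The trickiest part of the BAM branch above is locating the BGZF 'BC' extra subfield that stores the compressed block size. A standalone sketch of just that header walk, under the same assumptions as the example (the `header` buffer is little-endian and holds at least one complete BGZF block header; the helper name is ours):

// Scan the gzip extra field for the 'BC' subfield, whose 16-bit payload
// is (total block length - 1). XLEN sits at offset 10 of the gzip header;
// subfields start at offset 12 as SI1, SI2, SLEN(2), data.
static int bgzfBlockLength(ByteBuffer header) throws IOException {
    int xlen = header.getShort(10) & 0xffff;
    for (int i = 12, end = 12 + xlen; i < end; ) {
        int slen = header.getShort(i + 2) & 0xffff;
        if (header.getShort(i) == 0x4342 && slen == 2) // 'B','C' read little-endian
            return (header.getShort(i + 4) & 0xffff) + 1;
        i += 4 + slen;
    }
    throw new IOException("BGZF 'BC' extra subfield not found");
}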