Usage examples for org.apache.hadoop.io.IOUtils.skipFully
public static void skipFully(InputStream in, long len) throws IOException
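Unlike InputStream.skip(), which may skip fewer bytes than requested, skipFully consumes exactly len bytes or fails with an IOException, which is why the examples below can rely on it for fixed-size fields. A minimal, self-contained sketch of that usage (the file name and header length are hypothetical, for illustration only):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.IOUtils;

public class SkipFullyExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical fixed-size header preceding the payload.
        final long headerLength = 128;
        try (InputStream in = new FileInputStream("records.dat")) {
            // Skip exactly headerLength bytes; throws an IOException
            // if the stream ends before that many bytes were skipped.
            IOUtils.skipFully(in, headerLength);
            // ... read the payload that follows the header ...
        }
    }
}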
From source file: fi.tkk.ics.hadoop.bam.cli.plugins.Cat.java
License: Open Source License
@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));
    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.
    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.
    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.
    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));
        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.
    final OutputStream out;
    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.
    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);
    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.
    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.
                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Don't use readFully here, since EOF is fine.
                for (int read = 0, prev;
                     (prev = in.read(block.array(), read, block.capacity() - read)) < block.capacity();) {
                    // EOF is fine.
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat :: first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.
                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);
                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.
                IOUtils.readFully(bin, buf.array(), 0, 8);
                final int magic = buf.getInt(0), headerLen = buf.getInt(4);
                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);
                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.
                IOUtils.readFully(bin, buf.array(), 0, 4);
                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with the
                    // reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.
                final int remaining = bin.available();
                if (verbose)
                    System.err.printf("cat :: %d bytes to bgzip\n", remaining);
                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);
                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.
                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);

                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.
    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }

    return 0;
}
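In the BAM branch above, IOUtils.readFully and IOUtils.skipFully do the actual stepping over the uncompressed BAM header: the magic number and SAM-header length are read, the text header is skipped, and then each reference-dictionary entry is skipped by its name length plus the 4-byte reference length. A stripped-down sketch of just that pattern, assuming bin is an InputStream positioned at the start of the uncompressed BAM data (the class and method names here are illustrative, not part of Hadoop-BAM):

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.hadoop.io.IOUtils;

public final class BamHeaderSkipper {
    /** Advances {@code bin} just past the BAM text header and reference dictionary. */
    public static void skipBamHeader(InputStream bin) throws IOException {
        final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

        // "BAM\1" magic followed by the length of the SAM text header.
        IOUtils.readFully(bin, buf.array(), 0, 8);
        if (buf.getInt(0) != 0x014d4142)
            throw new IOException("bad BAM magic number");
        IOUtils.skipFully(bin, buf.getInt(4));        // skip the SAM text header

        // Number of reference sequences, then for each: name length, name, reference length.
        IOUtils.readFully(bin, buf.array(), 0, 4);
        for (int i = buf.getInt(0); i-- > 0;) {
            IOUtils.readFully(bin, buf.array(), 0, 4);   // l_name
            IOUtils.skipFully(bin, buf.getInt(0) + 4);   // name bytes + l_ref
        }
    }
}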
From source file: org.seqdoop.hadoop_bam.cli.plugins.Cat.java
License: Open Source License
@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));
    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.
    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.
    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.
    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));
        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.
    final OutputStream out;
    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.
    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);
    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.
    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.
                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Don't use readFully here, since EOF is fine.
                for (int read = 0, prev;
                     (prev = in.read(block.array(), read, block.capacity() - read)) < block.capacity();) {
                    // EOF is fine.
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat :: first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.
                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);
                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.
                IOUtils.readFully(bin, buf.array(), 0, 8);
                final int magic = buf.getInt(0), headerLen = buf.getInt(4);
                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);
                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.
                IOUtils.readFully(bin, buf.array(), 0, 4);
                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with the
                    // reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.
                final int remaining = bin.available();
                if (verbose)
                    System.err.printf("cat :: %d bytes to bgzip\n", remaining);
                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);
                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.
                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);

                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.
    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }

    return 0;
}
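Both examples also need blockLength, the compressed size of the first BGZF block, which they recover by scanning the gzip extra field for the "BC" subfield whose 2-byte BSIZE payload is the total block size minus one. A minimal sketch of that lookup over a buffered gzip header, matching the loop used above (the helper class and method names are illustrative):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class BgzfBlockLength {
    /** Returns the total length of the first BGZF block described by {@code header}. */
    public static int firstBlockLength(byte[] header) throws IOException {
        final ByteBuffer block = ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN);

        // XLEN sits at offset 10 of the gzip header; the extra subfields start at offset 12.
        final int xlen = block.getShort(10) & 0xffff;
        for (int i = 12, end = i + xlen; i < end;) {
            final int slen = block.getShort(i + 2) & 0xffff;
            // Subfield IDs 'B','C' read as the little-endian short 0x4342; its 2-byte
            // BSIZE payload is the total block size minus one.
            if (block.getShort(i) == 0x4342 && slen == 2)
                return (block.getShort(i + 4) & 0xffff) + 1;
            i += 4 + slen;
        }
        throw new IOException("BGZF extra field not found");
    }
}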