Usage examples for org.apache.hadoop.io.IOUtils.skipFully
public static void skipFully(InputStream in, long len) throws IOException
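Unlike InputStream.skip(), which may skip fewer bytes than requested, skipFully consumes exactly len bytes or fails with an IOException, which is why the examples below can rely on it for fixed-size fields. A minimal, self-contained sketch of that usage (the file name and header length are hypothetical, for illustration only):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.IOUtils;

public class SkipFullyExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical fixed-size header preceding the payload.
        final long headerLength = 128;
        try (InputStream in = new FileInputStream("records.dat")) {
            // Skip exactly headerLength bytes; throws an IOException
            // if the stream ends before that many bytes were skipped.
            IOUtils.skipFully(in, headerLength);
            // ... read the payload that follows the header ...
        }
    }
}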
From source file: fi.tkk.ics.hadoop.bam.cli.plugins.Cat.java
License: Open Source License
@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));
    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.
    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.
    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.
    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));
        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.
    final OutputStream out;
    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.
    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);
    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.
    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.
                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Don't use readFully here, since EOF is fine.
                for (int read = 0, prev;
                     (prev = in.read(block.array(), read, block.capacity() - read)) < block.capacity();) {
                    // EOF is fine.
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat :: first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.
                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);
                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.
                IOUtils.readFully(bin, buf.array(), 0, 8);
                final int magic = buf.getInt(0), headerLen = buf.getInt(4);
                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);
                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.
                IOUtils.readFully(bin, buf.array(), 0, 4);
                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with the
                    // reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.
                final int remaining = bin.available();
                if (verbose)
                    System.err.printf("cat :: %d bytes to bgzip\n", remaining);
                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);
                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.
                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);

                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.
    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }

    return 0;
}
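In the BAM branch above, IOUtils.readFully and IOUtils.skipFully do the actual stepping over the uncompressed BAM header: the magic number and SAM-header length are read, the text header is skipped, and then each reference-dictionary entry is skipped by its name length plus the 4-byte reference length. A stripped-down sketch of just that pattern, assuming bin is an InputStream positioned at the start of the uncompressed BAM data (the class and method names here are illustrative, not part of Hadoop-BAM):

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.hadoop.io.IOUtils;

public final class BamHeaderSkipper {
    /** Advances {@code bin} just past the BAM text header and reference dictionary. */
    public static void skipBamHeader(InputStream bin) throws IOException {
        final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

        // "BAM\1" magic followed by the length of the SAM text header.
        IOUtils.readFully(bin, buf.array(), 0, 8);
        if (buf.getInt(0) != 0x014d4142)
            throw new IOException("bad BAM magic number");
        IOUtils.skipFully(bin, buf.getInt(4));        // skip the SAM text header

        // Number of reference sequences, then for each: name length, name, reference length.
        IOUtils.readFully(bin, buf.array(), 0, 4);
        for (int i = buf.getInt(0); i-- > 0;) {
            IOUtils.readFully(bin, buf.array(), 0, 4);   // l_name
            IOUtils.skipFully(bin, buf.getInt(0) + 4);   // name bytes + l_ref
        }
    }
}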
From source file: org.seqdoop.hadoop_bam.cli.plugins.Cat.java
License: Open Source License
@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));
    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.
    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.
    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.
    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));
        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.
    final OutputStream out;
    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.
    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);
    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.
    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.
                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Don't use readFully here, since EOF is fine.
                for (int read = 0, prev;
                     (prev = in.read(block.array(), read, block.capacity() - read)) < block.capacity();) {
                    // EOF is fine.
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat :: first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.
                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);
                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.
                IOUtils.readFully(bin, buf.array(), 0, 8);
                final int magic = buf.getInt(0), headerLen = buf.getInt(4);
                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);
                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.
                IOUtils.readFully(bin, buf.array(), 0, 4);
                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with the
                    // reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.
                final int remaining = bin.available();
                if (verbose)
                    System.err.printf("cat :: %d bytes to bgzip\n", remaining);
                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);
                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.
                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);

                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.
    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }

    return 0;
}
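Both examples also need blockLength, the compressed size of the first BGZF block, which they recover by scanning the gzip extra field for the "BC" subfield whose 2-byte BSIZE payload is the total block size minus one. A minimal sketch of that lookup over a buffered gzip header, matching the loop used above (the helper class and method names are illustrative):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class BgzfBlockLength {
    /** Returns the total length of the first BGZF block described by {@code header}. */
    public static int firstBlockLength(byte[] header) throws IOException {
        final ByteBuffer block = ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN);

        // XLEN sits at offset 10 of the gzip header; the extra subfields start at offset 12.
        final int xlen = block.getShort(10) & 0xffff;
        for (int i = 12, end = i + xlen; i < end;) {
            final int slen = block.getShort(i + 2) & 0xffff;
            // Subfield IDs 'B','C' read as the little-endian short 0x4342; its 2-byte
            // BSIZE payload is the total block size minus one.
            if (block.getShort(i) == 0x4342 && slen == 2)
                return (block.getShort(i + 4) & 0xffff) + 1;
            i += 4 + slen;
        }
        throw new IOException("BGZF extra field not found");
    }
}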