Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usages for org.apache.hadoop.io.compress CompressionCodec createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
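
Before the full examples below, here is a minimal sketch of the common pattern: resolve a codec from the file extension with CompressionCodecFactory, then wrap the raw file stream with createInputStream so that reads return decompressed bytes. The class name and input URI are illustrative, not taken from any of the source files below.

import java.io.InputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class DecompressExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical input, e.g. hdfs://namenode/logs/events.gz
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);

        // Resolve a codec from the file extension (.gz, .bz2, ...)
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            return;
        }

        // createInputStream wraps the raw stream; reads now yield decompressed bytes
        try (InputStream in = codec.createInputStream(fs.open(path))) {
            IOUtils.copyBytes(in, System.out, 4096, false);
        }
    }
}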

Usage

From source file:com.hadoop.mapred.DeprecatedLzoLineRecordReader.java

License:Open Source License

DeprecatedLzoLineRecordReader(Configuration conf, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}

From source file:com.hadoop.mapreduce.FourMcLineRecordReader.java

License:BSD License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = HadoopUtils.getConfiguration(context);
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file:com.hadoop.mapreduce.LzoLineRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java

License:Apache License

/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt, CompressionCodec codec,
        String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *         \   out format
     *          \
     * in format \ seq    | text
     * ----------------------------
     *      seq  | Custom | ascii |
     * ----------------------------
     *      text | Text   | ascii |
     * ----------------------------
     */

    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually what we want.
         * We want to preserve the actual keys and values in the files, just like SequenceFileInputFormat. So, either way, the
         * keys and values should be the text representations of what went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);

            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();

            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());

            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();

            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writables.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {

        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}

From source file:com.inmobi.conduit.CompressedFileReaderTest.java

License:Apache License

private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);

    }
}

From source file:com.inmobi.conduit.distcp.tools.mapred.RetriableFileCopyCommand.java

License:Apache License

private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream, int bufferSize,
        Mapper.Context context, Map<Long, Long> received) throws IOException {
    Path source = sourceFileStatus.getPath();
    ThrottledInputStream inStream = null;
    final CompressionCodec codec = compressionCodecs.getCodec(source);
    InputStream compressedIn = null;
    OutputStream compressedOut = null;
    BufferedReader reader = null;
    long numberOfLinesRead = 0;

    try {
        inStream = getInputStream(source, HadoopCompat.getTaskConfiguration(context));
        compressedIn = codec.createInputStream(inStream);
        compressedOut = codec.createOutputStream(outStream);
        // LineReader reader = new LineReader(compressedIn,
        // context.getConfiguration(), null);
        reader = new BufferedReader(new InputStreamReader(compressedIn));
        byte[] bytesRead = readLine(reader);
        while (bytesRead != null) {
            numberOfLinesRead++;
            compressedOut.write(bytesRead);
            compressedOut.write("\n".getBytes());
            updateContextStatus(inStream.getTotalBytesRead(), context, sourceFileStatus, numberOfLinesRead);
            if (received != null) {
                byte[] decodedMsg = Base64.decodeBase64(bytesRead);
                incrementReceived(decodedMsg, received);
            }
            bytesRead = readLine(reader);
        }
        HadoopCompat.incrementCounter(HadoopCompat.getCounter(context, CopyMapper.Counter.SLEEP_TIME_MS),
                inStream.getTotalSleepTime());
        LOG.info("STATS: " + inStream);
    } finally {
        IOUtils.cleanup(LOG, inStream, reader, compressedIn);
        try {
            if (compressedOut != null)
                compressedOut.close();
            outStream.close();
        } catch (IOException exception) {
            LOG.error("Could not close output-stream. ", exception);
            throw exception;
        }
    }

    return inStream.getTotalBytesRead();// totalBytesRead;
}

From source file:com.jeffy.hdfs.compression.FileDecompressor.java

License:Apache License

/**
 * @param args URIs of the compressed files to decompress
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    // Load the default Hadoop configuration
    Configuration conf = new Configuration();
    // The codec factory resolves a codec from the file name suffix
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file suffix; available codecs are configured via io.compression.codecs
        CompressionCodec codec = factory.getCodec(inputPath);
        // Skip files whose suffix does not match any registered codec
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}

From source file:com.knewton.mrtool.io.JsonRecordReader.java

License:Apache License

/**
 * Get the line reader to be used for the file. A <code>LineReader</code> can read a file line
 * by line. This separate method helps with testing too.
 *
 * @param fileSplit
 * @param conf
 * @return
 * @throws IOException
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;
    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // if the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}

From source file:com.linkedin.cubert.block.BlockUtils.java

License:Open Source License

@SuppressWarnings("unchecked")
public static Block loadBlock(BlockProperties props, IndexEntry indexEntry, Configuration conf, JsonNode json,
        BlockSerializationType serializationType, boolean isInMemoryBlock) throws IOException,
        ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException {
    Block block;
    if (indexEntry == null) {
        if (emptyForMissing)
            return new EmptyBlock(props);

        throw new IOException("Index entry is null");
    }

    // populate props
    props.setBlockId(indexEntry.getBlockId());
    props.setNumRecords(indexEntry.getNumRecords());

    // Open the file and seek to the offset for this block
    Path file = new Path(indexEntry.getFile());
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE);
    fsin.seek(indexEntry.getOffset());

    // Gather information needed to read this block
    Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass();
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);

    // Load the block now
    if (isInMemoryBlock) {
        print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId());

        ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry);

        if (byteBuffer == null) {
            int read = 0;
            byte[] data = new byte[(int) indexEntry.getLength()];
            while (read != data.length) {
                read += fsin.read(data, read, data.length - read);
            }
            fsin.close();

            byteBuffer = ByteBuffer.wrap(data);

            inMemoryBlockCache.put(indexEntry, byteBuffer);
        } else {
            print.f("REUSED FROM CACHE!!");
            byteBuffer.rewind();
        }

        block = new RubixMemoryBlock(props, conf, byteBuffer, valueClass, codec, serializationType);
        block.configure(json);
        return block;
    } else {
        print.f("STREAMING the block %d", indexEntry.getBlockId());
        InputStream in = new BlockInputStream(fsin, indexEntry.getLength());

        if (codec != null) {
            in = codec.createInputStream(in);
        }

        block = new CubertBlock(props,
                new BlockIterator<Tuple>(conf, in, valueClass, serializationType, props.getSchema()));
        block.configure(json);

        print.f("Loaded block id=%d from file=%s offset=%d length=%d", indexEntry.getBlockId(), file.toString(),
                indexEntry.getOffset(), indexEntry.getLength());

        return block;
    }
}

From source file:com.linkedin.cubert.io.rubix.RubixMemoryBlock.java

License:Open Source License

public RubixMemoryBlock(BlockProperties props, Configuration conf, ByteBuffer byteBuffer,
        Class<Tuple> valueClass, CompressionCodec codec, BlockSerializationType serializationType)
        throws IOException {
    this.props = props;
    this.serializationType = serializationType;
    this.byteBuffer = byteBuffer;
    this.tupleCreator = new RubixTupleCreator();

    this.inputStream = new ByteBufferBackedInputStream(byteBuffer);

    switch (serializationType) {
    case DEFAULT:
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        deserializer = serializationFactory.getDeserializer(valueClass);
        break;
    case COMPACT:
        deserializer = new CompactDeserializer<Tuple>(props.getSchema());
        break;

    }

    if (codec == null) {
        deserializer.open(inputStream);
    } else {
        deserializer.open(codec.createInputStream(inputStream));
    }
    this.mark = this.currentTuplePosition = 0;
}