List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
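A minimal usage sketch before the full examples below (the input path is a placeholder, not taken from any of the source files; assumes the codec is registered and can be inferred from the file extension via CompressionCodecFactory):

Configuration conf = new Configuration();
Path path = new Path("/tmp/data.txt.gz"); // hypothetical compressed input file
FileSystem fs = path.getFileSystem(conf);
// Infer the codec from the file extension (e.g. .gz -> GzipCodec).
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
if (codec == null) {
    throw new IOException("No codec found for " + path);
}
// Wrap the raw stream; reads now yield decompressed bytes.
try (InputStream in = codec.createInputStream(fs.open(path))) {
    org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, conf, false);
}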
From source file:com.hadoop.mapred.DeprecatedLzoLineRecordReader.java
License:Open Source License
DeprecatedLzoLineRecordReader(Configuration conf, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);

    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}
From source file:com.hadoop.mapreduce.FourMcLineRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = HadoopUtils.getConfiguration(context);
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hadoop.mapreduce.LzoLineRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = context.getConfiguration();
    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt,
        CompressionCodec codec, String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *              \ out format
     *    in format \   seq    | text
     *    -----------------------------
     *    seq        | Custom  | ascii |
     *    -----------------------------
     *    text       | Text    | ascii |
     *    -----------------------------
     */

    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is
         * not actually what we want. We want to preserve the actual keys and values in the files, just
         * like SequenceFileInputFormat. So, either way, the keys and values should be the text
         * representations of what went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());
        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writables.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();
        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));
        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();
        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));
        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));
        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}
From source file:com.inmobi.conduit.CompressedFileReaderTest.java
License:Apache License
private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs;
    fs = FileSystem.getLocal(conf);
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("can't find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);
    }
}
From source file:com.inmobi.conduit.distcp.tools.mapred.RetriableFileCopyCommand.java
License:Apache License
private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream, int bufferSize,
        Mapper.Context context, Map<Long, Long> received) throws IOException {
    Path source = sourceFileStatus.getPath();
    ThrottledInputStream inStream = null;
    final CompressionCodec codec = compressionCodecs.getCodec(source);
    InputStream compressedIn = null;
    OutputStream compressedOut = null;
    BufferedReader reader = null;
    long numberOfLinesRead = 0;

    try {
        inStream = getInputStream(source, HadoopCompat.getTaskConfiguration(context));
        compressedIn = codec.createInputStream(inStream);
        compressedOut = codec.createOutputStream(outStream);
        reader = new BufferedReader(new InputStreamReader(compressedIn));

        byte[] bytesRead = readLine(reader);
        while (bytesRead != null) {
            numberOfLinesRead++;
            compressedOut.write(bytesRead);
            compressedOut.write("\n".getBytes());
            updateContextStatus(inStream.getTotalBytesRead(), context, sourceFileStatus,
                    numberOfLinesRead);
            if (received != null) {
                byte[] decodedMsg = Base64.decodeBase64(bytesRead);
                incrementReceived(decodedMsg, received);
            }
            bytesRead = readLine(reader);
        }

        HadoopCompat.incrementCounter(HadoopCompat.getCounter(context, CopyMapper.Counter.SLEEP_TIME_MS),
                inStream.getTotalSleepTime());
        LOG.info("STATS: " + inStream);
    } finally {
        IOUtils.cleanup(LOG, inStream, reader, compressedIn);
        try {
            if (compressedOut != null)
                compressedOut.close();
            outStream.close();
        } catch (IOException exception) {
            LOG.error("Could not close output-stream. ", exception);
            throw exception;
        }
    }

    return inStream.getTotalBytesRead();
}
From source file:com.jeffy.hdfs.compression.FileDecompressor.java
License:Apache License
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    // Load the Hadoop configuration.
    Configuration conf = new Configuration();
    // The factory maps file extensions to the codecs registered via io.compression.codecs.
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file name extension.
        CompressionCodec codec = factory.getCodec(inputPath);
        // Skip files whose extension matches no known codec.
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}
From source file:com.knewton.mrtool.io.JsonRecordReader.java
License:Apache License
/**
 * Gets the line reader to be used for the file. A <code>LineReader</code> can read a file line by line.
 * This separate method helps with testing too.
 *
 * @param fileSplit the split whose file should be read
 * @param conf the job configuration
 * @return a line reader positioned at the start of the split
 * @throws IOException
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;

    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // If the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}
From source file:com.linkedin.cubert.block.BlockUtils.java
License:Open Source License
@SuppressWarnings("unchecked") public static Block loadBlock(BlockProperties props, IndexEntry indexEntry, Configuration conf, JsonNode json, BlockSerializationType serializationType, boolean isInMemoryBlock) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { Block block;//from w w w . j a va 2s .c o m if (indexEntry == null) { if (emptyForMissing) return new EmptyBlock(props); throw new IOException(String.format("Index entry is null")); } // populate props props.setBlockId(indexEntry.getBlockId()); props.setNumRecords(indexEntry.getNumRecords()); // Open the file and seek to the offset for this block Path file = new Path(indexEntry.getFile()); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE); fsin.seek(indexEntry.getOffset()); // Gather information needed to read this block Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass(); CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file); // Load the block now if (isInMemoryBlock) { print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId()); ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry); if (byteBuffer == null) { int read = 0; byte[] data = new byte[(int) indexEntry.getLength()]; while (read != data.length) { read += fsin.read(data, read, data.length - read); } fsin.close(); byteBuffer = ByteBuffer.wrap(data); inMemoryBlockCache.put(indexEntry, byteBuffer); } else { print.f("REUSED FROM CACHE!!"); byteBuffer.rewind(); } block = new RubixMemoryBlock(props, conf, byteBuffer, valueClass, codec, serializationType); block.configure(json); return block; } else { print.f("STREAMING the block %d", indexEntry.getBlockId()); InputStream in = new BlockInputStream(fsin, indexEntry.getLength()); if (codec != null) { in = codec.createInputStream(in); } block = new CubertBlock(props, new BlockIterator<Tuple>(conf, in, valueClass, serializationType, props.getSchema())); block.configure(json); print.f("Loaded block id=%d from file=%s offset=%d length=%d", indexEntry.getBlockId(), file.toString(), indexEntry.getOffset(), indexEntry.getLength()); return block; } }
From source file:com.linkedin.cubert.io.rubix.RubixMemoryBlock.java
License:Open Source License
public RubixMemoryBlock(BlockProperties props, Configuration conf, ByteBuffer byteBuffer,
        Class<Tuple> valueClass, CompressionCodec codec, BlockSerializationType serializationType)
        throws IOException {
    this.props = props;
    this.serializationType = serializationType;
    this.byteBuffer = byteBuffer;
    this.tupleCreator = new RubixTupleCreator();
    this.inputStream = new ByteBufferBackedInputStream(byteBuffer);

    switch (serializationType) {
    case DEFAULT:
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        deserializer = serializationFactory.getDeserializer(valueClass);
        break;
    case COMPACT:
        deserializer = new CompactDeserializer<Tuple>(props.getSchema());
        break;
    }

    if (codec == null) {
        deserializer.open(inputStream);
    } else {
        deserializer.open(codec.createInputStream(inputStream));
    }

    this.mark = this.currentTuplePosition = 0;
}