Example usage of the org.apache.hadoop.io.compress.CompressionCodecFactory constructor

List of usage examples for the org.apache.hadoop.io.compress.CompressionCodecFactory constructor

Introduction

On this page you can find example usages of the constructor of org.apache.hadoop.io.compress.CompressionCodecFactory.

Prototype

public CompressionCodecFactory(Configuration conf) 

Source Link

Document

Find the codecs specified in the config value io.compression.codecs and register them.

Usage

From source file:StreamWikiDumpInputFormat.java

License:Apache License

/**
 * Creates the compression codec factory for this input format from the job configuration.
 *
 * @param conf job configuration passed to the {@link CompressionCodecFactory} constructor
 */
public void configure(JobConf conf) {
    final CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    this.compressionCodecs = factory;
}

From source file:be.ugent.intec.halvade.uploader.HalvadeUploader.java

License:Open Source License

/**
 * Parses the command-line arguments and stores the results in this uploader's fields.
 *
 * <p>Option "1" is either a manifest (a value ending in ".manifest") or the first input
 * file; option "O" is the output directory, which is normalised to end with "/". When both
 * "snappy" and "lz4" are given, lz4 takes precedence.
 *
 * @param args the raw command-line arguments
 * @throws ParseException if the arguments cannot be parsed, option "1" or "O" has no value,
 *         or the "size" value (in MB) does not fit in an int once converted to bytes
 */
public void parseArguments(String[] args) throws ParseException {
    createOptions();
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    manifest = line.getOptionValue("1");
    if (manifest == null) {
        // getOptionValue returns null when the option is absent; fail clearly instead of with an NPE.
        throw new ParseException("Missing value for required option: 1");
    }
    if (!manifest.endsWith(".manifest")) {
        file1 = manifest;
        manifest = null;
    }

    outputDir = line.getOptionValue("O");
    if (outputDir == null) {
        throw new ParseException("Missing value for required option: O");
    }
    if (!outputDir.endsWith("/")) {
        outputDir += "/";
    }

    if (line.hasOption("2")) {
        file2 = line.getOptionValue("2");
    }
    if (line.hasOption("profile")) {
        profile = line.getOptionValue("profile");
    }
    if (line.hasOption("t")) {
        mthreads = Integer.parseInt(line.getOptionValue("t"));
    }
    if (line.hasOption("i")) {
        isInterleaved = true;
    }
    if (line.hasOption("sse")) {
        SSE = true;
    }

    // Checked in this order so that lz4 overrides snappy when both flags are present.
    String codecClassName = null;
    if (line.hasOption("snappy")) {
        codecClassName = "org.apache.hadoop.io.compress.SnappyCodec";
    }
    if (line.hasOption("lz4")) {
        codecClassName = "org.apache.hadoop.io.compress.Lz4Codec";
    }
    if (codecClassName != null) {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(getConf());
        codec = codecFactory.getCodecByClassName(codecClassName);
    }
    if (codec != null) {
        Logger.DEBUG("Hadoop compression: " + codec.getDefaultExtension().substring(1));
    }

    if (line.hasOption("size")) {
        final int bytesPerMegabyte = 1024 * 1024;
        int sizeInMegabytes = Integer.parseInt(line.getOptionValue("size"));
        // The int multiplication below would silently wrap around for values above this bound.
        if (sizeInMegabytes > Integer.MAX_VALUE / bytesPerMegabyte) {
            throw new ParseException("Value of option size is too large: " + sizeInMegabytes + " MB");
        }
        bestFileSize = sizeInMegabytes * bytesPerMegabyte;
    }
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * <p>For an uncompressed file the stream is positioned at the first record of the split.
 * For a compressed file the split must start at offset 0 and the reader reads until the
 * end of the file.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 * @throws IOException if the file cannot be opened or the input stream cannot be set up
 * @throws RuntimeException if the file is compressed and the split does not start at 0
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(file);

        if (codec == null) { // no codec.  Uncompressed file.
            positionAtFirstRecord(fileIn);
            inputStream = fileIn;
        } else {
            // compressed file
            if (start != 0) {
                throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
            }

            inputStream = codec.createInputStream(fileIn);
            end = Long.MAX_VALUE; // read until the end of the file
        }
    } catch (IOException | RuntimeException e) {
        // The constructor is failing, so nobody else can close the stream opened above.
        try {
            fileIn.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }

    lineReader = new LineReader(inputStream);
}

From source file:ca.sparkera.adapters.mapreduce.MainframeVBRecordReader.java

License:Apache License

/**
 * Opens the file and prepares this reader for the split that begins at {@code splitStart}
 * and spans {@code splitLength} bytes.
 *
 * <p>If a compression codec is found for the file, input is read through a pooled
 * decompressor; otherwise the raw stream is moved to the start of the split.
 *
 * @param job the configuration used to resolve the file system and the codec
 * @param splitStart offset of the first byte of the split
 * @param splitLength length of the split in bytes
 * @param file the file to read
 * @throws IOException if the file cannot be opened or positioned
 */
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {

    start = splitStart;
    end = start + splitLength;
    LOG.info("Start of the split:" + start + "-End of split:" + end);
    LOG.debug("VLR initialize started: start pos:" + start + "endpos:" + end);

    // open the file; seeking happens below, only for uncompressed input
    fileIn = file.getFileSystem(job).open(file);

    final CompressionCodecFactory codecFactory = new CompressionCodecFactory(job);
    final CompressionCodec codec = codecFactory.getCodec(file);
    if (codec == null) {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        numBytesRemainingInSplit = splitLength;
        LOG.info("Variable length input; cannot compute number of records in the split");
    } else {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        final CompressionInputStream decompressed = codec.createInputStream(fileIn, decompressor);
        filePosition = decompressed;
        inputStream = decompressed;
        LOG.info("Compressed input; cannot compute number of records in the split");
    }
    this.pos = start;
}

From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java

License:Apache License

/**
 * Decompresses a file, choosing the codec from the file name and writing the result to the
 * same URI with the codec's default extension removed.
 *
 * <p>use case: % hadoop FileDecompressor file.gz
 *
 * @param args command-line arguments; {@code args[0]} is the URI of the compressed file
 */
public static void main(String[] args) {
    if (args == null || args.length < 1) {
        // Without this guard a missing argument surfaces as ArrayIndexOutOfBoundsException.
        System.err.println("Usage: FileDecompressor <uri>");
        System.exit(1);
    }

    String uri = args[0];
    Configuration conf = new Configuration();

    // Declared outside the try block so that the finally clause can close them.
    InputStream in = null;
    OutputStream out = null;

    try {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));

        IOUtils.copyBytes(in, out, conf);

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java

License:Apache License

/**
 * Prepares this reader for the given split: records the split bounds, sets the key to the
 * name of the split's file, opens the file and builds the line reader. The line reader
 * goes through a decompressor when {@code isCompressedInput()} reports compressed input.
 *
 * @param genericSplit the split to read; cast to {@link FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened, positioned or read
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    final Path file = fileSplit.getPath();
    // The key is the name of the file being read.
    key = new Text(file.getName());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file; for uncompressed input, seek to the start of the split
    fileIn = file.getFileSystem(job).open(file);
    final boolean customDelimiter = this.recordDelimiterBytes != null;
    if (!isCompressedInput()) {
        fileIn.seek(start);
        in = customDelimiter
                ? new LineReader(fileIn, job, this.recordDelimiterBytes)
                : new LineReader(fileIn, job);
        filePosition = fileIn;
    } else {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream splitIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = customDelimiter
                    ? new LineReader(splitIn, job, this.recordDelimiterBytes)
                    : new LineReader(splitIn, job);

            // The codec may move the split boundaries; adopt its adjusted values.
            start = splitIn.getAdjustedStart();
            end = splitIn.getAdjustedEnd();
            filePosition = splitIn;
        } else {
            in = customDelimiter
                    ? new LineReader(codec.createInputStream(fileIn, decompressor), job,
                            this.recordDelimiterBytes)
                    : new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameTextInputFormat.java

License:Apache License

/**
 * Reports whether the file may be split: true when no compression codec matches the file,
 * or when the matching codec is a {@link SplittableCompressionCodec}.
 *
 * @param context the job context supplying the configuration
 * @param file the file to check
 * @return {@code true} if the file can be split
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    final CompressionCodec codec = codecFactory.getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
}

From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java

License:Apache License

/**
 * Prepares this reader for the given split: records the split bounds, sets the key to the
 * string form of the split's file path, opens the file and builds the line reader. The line
 * reader goes through a decompressor when {@code isCompressedInput()} reports compressed
 * input.
 *
 * @param genericSplit the split to read; cast to {@link FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened, positioned or read
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    final Path file = fileSplit.getPath();
    // The key is the string form of the path being read.
    key = new Text(file.toString());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file; for uncompressed input, seek to the start of the split
    fileIn = file.getFileSystem(job).open(file);
    final boolean customDelimiter = this.recordDelimiterBytes != null;
    if (!isCompressedInput()) {
        fileIn.seek(start);
        in = customDelimiter
                ? new LineReader(fileIn, job, this.recordDelimiterBytes)
                : new LineReader(fileIn, job);
        filePosition = fileIn;
    } else {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream splitIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = customDelimiter
                    ? new LineReader(splitIn, job, this.recordDelimiterBytes)
                    : new LineReader(splitIn, job);

            // The codec may move the split boundaries; adopt its adjusted values.
            start = splitIn.getAdjustedStart();
            end = splitIn.getAdjustedEnd();
            filePosition = splitIn;
        } else {
            in = customDelimiter
                    ? new LineReader(codec.createInputStream(fileIn, decompressor), job,
                            this.recordDelimiterBytes)
                    : new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:co.cask.hydrator.plugin.batch.CopybookInputFormat.java

License:Apache License

/**
 * Reports whether the input may be split, based on the compression codec that matches the
 * data path configured under {@code COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH}.
 *
 * <p>If that property is unset or empty, the {@code file} argument is used instead, because
 * a {@link Path} cannot be created from a null or empty string.
 *
 * @param context the job context supplying the configuration
 * @param file the file being considered; used only as the fallback described above
 * @return {@code true} if no codec matches the path or the codec is a
 *         {@link SplittableCompressionCodec}
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    Configuration conf = context.getConfiguration();
    String configuredPath = conf.get(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH);
    Path path = (configuredPath == null || configuredPath.isEmpty()) ? file : new Path(configuredPath);
    final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    return codec == null || codec instanceof SplittableCompressionCodec;
}

From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java

License:Apache License

/**
 * Prepares this reader for the given split. Reads the delimiter, the column index and the
 * maximum line length from the configuration, opens the split's file and builds the line
 * reader.
 *
 * <p>For a compressed file the end position is set to {@code Long.MAX_VALUE}. For an
 * uncompressed split that does not start at offset 0, reading begins one byte earlier and
 * the first line is skipped.
 *
 * @param genericSplit the split to read; cast to {@link FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened, positioned or read
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    final Path file = fileSplit.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file; seeking happens below, only for uncompressed input
    final FSDataInputStream fileIn = file.getFileSystem(job).open(fileSplit.getPath());
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else if (start == 0) {
        in = new LineReader(fileIn, job);
    } else {
        // Back up one byte, then skip the first line and re-establish "start".
        --start;
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}