Example usage for org.apache.hadoop.io.compress CompressionCodecFactory getCodec

List of usage examples for org.apache.hadoop.io.compress CompressionCodecFactory getCodec

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec.

Prototype

public CompressionCodec getCodec(Path file) 

Source Link

Document

Find the relevant compression codec for the given file based on its filename suffix.

Usage

From source file:com.hadoop.mapreduce.LzoLineRecordReader.java

License:Open Source License

/**
 * Prepares this reader for one split of a compressed file.
 *
 * <p>Resolves the codec from the file name, opens the file, wraps it in a
 * decompressing {@code LineReader} and records the split boundaries in
 * {@code start}, {@code end} and {@code pos}. When the split does not begin at
 * offset 0, the first line read from the split start is discarded and
 * {@code start} is moved to the position reached afterwards.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context task context supplying the job {@code Configuration}
 * @throws IOException if no codec matches the file name or the file cannot be read
 * @throws InterruptedException declared by the signature
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // open the file; seeking to the split start happens further down
    fileIn = fs.open(file);

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file:com.hadoop.mapreduce.LzoTextInputFormat.java

License:Open Source License

/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * <p>The block offsets are written to a temporary file next to the lzo file
 * (index name plus ".tmp"), which is renamed to the final index name once the
 * whole file has been scanned.
 *
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 *           if no codec is registered for the file name, a block header is
 *           invalid, or reading/writing fails.
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (codec == null) {
        // fail with a descriptive error instead of a NullPointerException below
        throw new IOException("No compression codec found for " + lzoFile);
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start (the two ints just read precede pos)
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        try {
            if (lzoIs != null) {
                lzoIs.close();
            } else if (is != null) {
                // the codec stream was never created, so the raw stream must be closed here
                is.close();
            }
        } finally {
            // close the index output even when closing the input failed
            if (os != null) {
                os.close();
            }
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}

From source file:com.inmobi.conduit.CompressedFileReaderTest.java

License:Apache License

/**
 * Decompresses a local file into a sibling file whose name is the input name
 * with the codec's default extension removed and "-uncompressed" appended.
 *
 * <p>Prints a message and exits the JVM with status 1 when no codec matches
 * the file name.
 *
 * @param fileName path of the compressed file on the local file system
 * @throws Exception if opening, creating or copying fails
 */
private void uncompress(String fileName) throws Exception {
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.getLocal(conf);

    final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");

    final CompressionInputStream compressedIn = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream plainOut = null;
    try {
        final String strippedName = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        plainOut = fs.create(new Path(strippedName + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(compressedIn, plainOut, conf);
    } finally {
        // output first, then input, as before
        org.apache.hadoop.io.IOUtils.closeStream(plainOut);
        IOUtils.closeStream(compressedIn);
    }
}

From source file:com.jeffy.hdfs.compression.FileDecompressor.java

License:Apache License

/**
 * Decompresses every file named on the command line, writing each result to
 * the same URI with the codec's default extension removed.
 *
 * <p>Files for which no codec is found are reported on standard error and
 * skipped.
 *
 * @param args URIs of the compressed files
 * @throws IOException if a file system cannot be obtained or a copy fails
 */
public static void main(String[] args) throws IOException {
    final Configuration conf = new Configuration();
    final CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
    for (final String uri : args) {
        final FileSystem fs = FileSystem.get(URI.create(uri), conf);
        final Path source = new Path(uri);
        // the factory picks the codec from the file name suffix
        final CompressionCodec codec = codecs.getCodec(source);
        if (codec != null) {
            final String strippedUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
            try (InputStream in = codec.createInputStream(fs.open(source));
                    OutputStream out = fs.create(new Path(strippedUri))) {
                IOUtils.copyBytes(in, out, conf);
            }
        } else {
            System.err.println("No codec found for " + uri);
        }
    }
}

From source file:com.matthewrathbone.hadoop.MRTester.java

License:Apache License

/**
 * Reads every file directly under {@code location} and returns their lines.
 *
 * <p>Entries whose name starts with "_" are skipped. A file is decompressed
 * when the codec factory finds a codec for its path; otherwise it is read as
 * is. Content is decoded as UTF-8 and split on "\n".
 *
 * @param location directory (or file) to list
 * @return the collected lines in listing order; empty when the listing is {@code null}
 * @throws Exception if listing or reading fails
 */
public List<String> collectStrings(Path location) throws Exception {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = fileSystem.open(item.getPath());
        try {
            // wrap in a decompressing stream when a codec matches the path
            if (codec != null) {
                stream = codec.createInputStream(stream);
            }

            StringWriter writer = new StringWriter();
            IOUtils.copy(stream, writer, "UTF-8");
            for (String str : writer.toString().split("\n")) {
                results.add(str);
            }
        } finally {
            // the original never closed the stream, leaking one handle per file
            stream.close();
        }
    }
    return results;
}

From source file:com.taobao.datax.plugins.common.DFSUtils.java

License:Open Source License

/**
 * Check file type in hdfs./*from ww  w . j  av a2s  .  c o  m*/
 * 
 * @param fs
 *            handle of {@link FileSystem}
 * 
 * @param path
 *            hdfs {@link Path}
 * 
 * @param conf
 *            {@link Configuration}
 * 
 * @return {@link HdfsFileType} TXT, TXT_COMP, SEQ
 * */
public static HdfsFileType checkFileType(FileSystem fs, Path path, Configuration conf) throws IOException {
    FSDataInputStream is = null;
    try {
        is = fs.open(path);
        /* file is empty, use TXT readerup */
        if (0 == is.available()) {
            return HdfsFileType.TXT;
        }

        switch (is.readShort()) {
        case 0x5345:
            if (is.readByte() == 'Q') {
                // TODO: add RCFile
                return HdfsFileType.SEQ;
            }
        default:
            is.seek(0);
            CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = compressionCodecFactory.getCodec(path);
            if (null == codec)
                return HdfsFileType.TXT;
            else {
                return HdfsFileType.COMP_TXT;
            }
        }
    } catch (IOException e) {
        throw e;
    } finally {
        if (null != is) {
            try {
                is.close();
            } catch (Exception ex) {
            }

        }

    }
}

From source file:com.yahoo.glimmer.util.MapReducePartInputStreamEnumeration.java

License:Open Source License

/**
 * Records the file system, resolves the codec for {@code srcPath} and collects
 * the file statuses this instance works on.
 *
 * <p>For a directory the statuses are those matching "part-?-?????" followed
 * by the codec's default extension when a codec was found. For anything else
 * the single status of {@code srcPath} is used.
 *
 * @param fileSystem file system holding {@code srcPath}
 * @param srcPath a file or a directory of part files
 * @throws IOException if the status lookup or the glob fails
 */
public MapReducePartInputStreamEnumeration(FileSystem fileSystem, Path srcPath) throws IOException {
    this.fileSystem = fileSystem;
    codecIfAny = new CompressionCodecFactory(fileSystem.getConf()).getCodec(srcPath);

    final FileStatus srcStatus = fileSystem.getFileStatus(srcPath);
    if (!srcStatus.isDirectory()) {
        partFileStatuses = new FileStatus[] { srcStatus };
    } else {
        final String codecSuffix = (codecIfAny == null) ? "" : codecIfAny.getDefaultExtension();
        // returns FileStatus objects sorted by filename.
        partFileStatuses = fileSystem.globStatus(new Path(srcPath, "part-?-?????" + codecSuffix));
    }
}

From source file:com.yahoo.glimmer.util.MergeSortTool.java

License:Open Source License

/**
 * Merges the given source files into {@code outputPath} using
 * {@code ReadersWriterMergeSort}.
 *
 * <p>The input codec is resolved once, from the first source path, and applied
 * to every source. The output codec is resolved from {@code outputPath}. No
 * charset is passed to the readers or the writer, so the platform default
 * charset applies. All streams opened here are closed before returning, also
 * when opening a source or the merge itself fails.
 *
 * @param fs file system holding the sources and the output
 * @param sourcePaths files to merge; must not be empty (checked by assertion only)
 * @param outputPath file to create
 * @param compressionCodecFactory factory used to resolve codecs from file names
 * @return the count reported by {@code ReadersWriterMergeSort.mergeSort}
 * @throws IOException if any stream cannot be opened, read, written or closed
 */
public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath,
        CompressionCodecFactory compressionCodecFactory) throws IOException {
    assert sourcePaths.size() > 0 : "No source paths given.";

    LOG.info("Sorted merge into " + outputPath.toString());
    OutputStream outputStream = fs.create(outputPath);

    List<BufferedReader> readers = new ArrayList<BufferedReader>();
    OutputStreamWriter writer = null;
    boolean completed = false;
    try {
        CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
        if (inputCompressionCodec != null) {
            LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
        }

        CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
        if (outputCompressionCodec != null) {
            LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
            outputStream = outputCompressionCodec.createOutputStream(outputStream);
        }

        writer = new OutputStreamWriter(outputStream);

        for (Path partPath : sourcePaths) {
            LOG.info("\tAdding source " + partPath.toString());
            readers.add(openSourceReader(fs, partPath, inputCompressionCodec));
        }

        int count = ReadersWriterMergeSort.mergeSort(readers, writer);

        // on success, close failures still propagate (a failed flush means lost output)
        writer.close();
        for (BufferedReader reader : readers) {
            reader.close();
        }
        readers.clear();
        completed = true;

        LOG.info("Processed " + count + " lines into " + outputPath.toString());
        return count;
    } finally {
        if (!completed) {
            // failure path: release everything without masking the original exception
            if (writer != null) {
                closeIgnoringFailure(writer);
            } else {
                closeIgnoringFailure(outputStream);
            }
            for (BufferedReader reader : readers) {
                closeIgnoringFailure(reader);
            }
        }
    }
}

/** Opens one source, decompressing it when a codec is given; closes the raw stream if wrapping fails. */
private static BufferedReader openSourceReader(FileSystem fs, Path partPath, CompressionCodec codec)
        throws IOException {
    InputStream inputStream = fs.open(partPath);
    try {
        if (codec != null) {
            inputStream = codec.createInputStream(inputStream);
        }
        return new BufferedReader(new InputStreamReader(inputStream));
    } catch (IOException e) {
        closeIgnoringFailure(inputStream);
        throw e;
    } catch (RuntimeException e) {
        closeIgnoringFailure(inputStream);
        throw e;
    }
}

/** Closes a resource during error cleanup, ignoring any secondary failure. */
private static void closeIgnoringFailure(java.io.Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException ignored) {
        // already unwinding from an earlier failure; that one is reported
    }
}

From source file:crunch.MaxTemperature.java

License:Apache License

/**
 * Decompresses the file named by the first argument, writing the result to the
 * same URI with the codec's default extension removed.
 *
 * <p>Prints a message to standard error and exits with status 1 when no codec
 * is found for the file name.
 *
 * @param args {@code args[0]} is the URI of the compressed file
 * @throws Exception if the file system cannot be obtained or the copy fails
 */
public static void main(String[] args) throws Exception {
    final String uri = args[0];
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.get(URI.create(uri), conf);

    final Path source = new Path(uri);
    final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(source);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    final String strippedUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream compressedIn = null;
    OutputStream plainOut = null;
    try {
        compressedIn = codec.createInputStream(fs.open(source));
        plainOut = fs.create(new Path(strippedUri));
        IOUtils.copyBytes(compressedIn, plainOut, conf);
    } finally {
        // input first, then output, as before
        IOUtils.closeStream(compressedIn);
        IOUtils.closeStream(plainOut);
    }
}

From source file:crunch.MaxTemperature.java

License:Apache License

/**
 * Creates {@code input} and writes {@code records} lines to it, each line
 * being {@code line(n, recordLength)} followed by "\n".
 *
 * <p>The output is compressed when the codec factory finds a codec for the
 * file name. The stream is closed even if codec lookup, wrapping or writing
 * fails.
 *
 * @param input file to create (overwritten if present)
 * @param records number of lines to write
 * @param recordLength passed through to {@code line}
 * @throws IOException if the file cannot be created or written
 */
private void createFile(Path input, int records, int recordLength) throws IOException {
    long fileSize = 4096;
    // overwrite, 4096-byte buffer, replication 1, block size = fileSize
    OutputStream out = fs.create(input, true, 4096, (short) 1, fileSize);
    Writer writer = null;
    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(new Configuration());
        CompressionCodec codec = codecFactory.getCodec(input);
        if (codec != null) {
            out = codec.createOutputStream(out);
        }
        writer = new OutputStreamWriter(out);
        for (int n = 0; n < records; n++) {
            writer.write(line(n, recordLength));
            writer.write("\n");
        }
    } finally {
        if (writer != null) {
            writer.close();
        } else {
            // failed before the writer existed: close the stream directly
            out.close();
        }
    }
}