Example usage for org.apache.hadoop.io.compress CompressionCodecFactory getCodec

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec.

Prototype

public CompressionCodec getCodec(Path file) 

Document

Find the relevant compression codec for the given file based on its filename suffix.
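
For orientation, the snippet below is a minimal, self-contained sketch of the usual pattern: build a CompressionCodecFactory from the job Configuration, ask it for the codec that matches a file's suffix, and fall back to the raw stream when getCodec returns null (that is, when the suffix is not a registered compression extension). The class name GetCodecExample and the command-line path are placeholders, not taken from the examples listed under Usage.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class GetCodecExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(args[0]); // e.g. /data/input.txt.gz

        // The factory picks a codec based on the filename suffix (.gz, .bz2, ...)
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // null means the suffix is not a known compression extension,
        // so the file is read as-is
        InputStream in = (codec == null) ? fs.open(path) : codec.createInputStream(fs.open(path));

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            System.out.println(reader.readLine());
        }
    }
}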

Usage

From source file: org.hedera.util.SeekableInputStream.java

License: Apache License

public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // non-splittable compression input stream
            // no seeking or offsetting is needed
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // non compression input stream
        // we seek to the start of the split
        din.seek(start);
        return new SeekableInputStream(din);
    }
}

From source file: org.hipi.tools.downloader.Downloader.java

License: Open Source License

public int run(String[] args) throws Exception {

    // try to parse command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
    } catch (ParseException exp) {
        usage();
    }
    if (line == null) {
        usage();
    }

    String[] leftArgs = line.getArgs();

    if (leftArgs.length != 2) {
        usage();
    }

    String inputDir = leftArgs[0];
    String outputHib = leftArgs[1];

    boolean yfcc100m = line.hasOption("yfcc100m");
    int numDownloadNodes = (yfcc100m ? 1
            : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1)));
    if (numDownloadNodes < 1) {
        System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
        System.exit(1);
    }

    boolean overwrite = line.hasOption("force");

    System.out.println("Source directory: " + inputDir);
    System.out.println("Output HIB: " + outputHib);
    System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
    System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
    System.out.println("Number of download nodes: " + numDownloadNodes);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Remove existing HIB if overwrite is specified and HIB exists
    if (!overwrite) {
        if (fs.exists(new Path(outputHib))) {
            System.err.println(
                    "HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
            System.exit(1);
        }
    } else { // overwrite
        if (fs.exists(new Path(outputHib))) {
            System.out.println("Found that output HIB already exists, deleting.");
        }
    }

    fs.delete(new Path(outputHib), true);
    fs.delete(new Path(outputHib + ".dat"), true);
    fs.delete(new Path(outputHib + "_output"), true);

    // Scan source directory for list of input files
    FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
    if (inputFiles == null || inputFiles.length == 0) {
        System.err.println("Failed to find any files in source directory: " + inputDir);
        System.exit(1);
    }

    // Validate list of input files
    ArrayList<Path> sourceFiles = new ArrayList<Path>();
    for (FileStatus file : inputFiles) {

        Path path = file.getPath();

        if (yfcc100m) {
            String[] tokens = path.getName().split("-");
            if (tokens == null || tokens.length == 0) {
                System.out.println("  Skipping source file (does not follow YFCC100M file name convention): "
                        + file.getPath());
                continue;
            }
        }

        try {
            // If it exists, get the relevant compression codec
            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(path);

            FSDataInputStream fis = fs.open(path);

            // If the codec was found, use it to create a decompressed input stream.
            // Otherwise, assume the input stream is already decompressed
            BufferedReader reader = null;
            if (codec != null) {
                reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
            } else {
                reader = new BufferedReader(new InputStreamReader(fis));
            }

            String fileLine = reader.readLine();
            String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

            if (yfcc100m) {
                if (lineFields.length != 23) {
                    System.out.println("  Skipping source file (does not follow YFCC100M source file format): "
                            + file.getPath());
                    String imageUri = null;
                } else {
                    System.out.println("  Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            } else {
                if (lineFields.length != 1) {
                    System.out.println(
                            "  Skipping source file (contains multiple fields per line where only one is expected): "
                                    + file.getPath());
                    if (lineFields.length == 23) {
                        System.out.println("  Did you mean to use \"--yfcc100m\"?");
                    }
                    String imageUri = null;
                } else {
                    System.out.println("  Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            }
            fis.close();
            reader = null;
        } catch (Exception e) {
            System.err.println("Skipping source file (unable to open and parse first line: " + file.getPath());
            continue;
        }

    }

    if (sourceFiles.size() == 0) {
        System.err.println("Failed to find any valid files in source directory: " + inputDir);
        System.exit(1);
    }

    // Construct path to directory containing outputHib
    String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

    // Attaching job parameters to global Configuration object
    conf.setInt("downloader.nodes", numDownloadNodes);
    conf.setStrings("downloader.outfile", outputHib);
    conf.setStrings("downloader.outpath", outputPath);
    conf.setBoolean("downloader.yfcc100m", yfcc100m);

    Job job = Job.getInstance(conf, "hibDownload");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);
    job.setInputFormatClass(DownloaderInputFormat.class);
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

    Path[] inputPaths = new Path[sourceFiles.size()];
    inputPaths = sourceFiles.toArray(inputPaths);
    DownloaderInputFormat.setInputPaths(job, inputPaths);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: org.icgc.dcc.release.core.hadoop.FileGlobInputStream.java

License: Open Source License

private static InputStream createDecodedInputStream(FileSystem fileSystem, Path file,
        CompressionCodecFactory factory) throws IOException {
    val codec = factory.getCodec(file);
    val decoded = codec == null;

    return decoded ? fileSystem.open(file) : codec.createInputStream(fileSystem.open(file));
}

From source file: org.mrgeo.hdfs.ingest.HdfsImageIngestDataProvider.java

License: Apache License

@Override
public InputStream openImage() throws IOException {

    Path path = new Path(getResourceName());

    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, path);

    if (fs.exists(path)) {
        final InputStream stream = fs.open(path, 131072); // give open a 128K buffer

        Configuration localConf = HadoopUtils.createConfiguration();
        // see if the file is compressed
        final CompressionCodecFactory factory = new CompressionCodecFactory(localConf);
        final CompressionCodec codec = factory.getCodec(path);

        if (codec != null) {
            return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream));
        }

        return stream;
    }

    throw new FileNotFoundException("File not found: " + path.toUri().toString());
}

From source file: org.mrgeo.image.geotools.GeotoolsRasterUtils.java

License: Apache License

private static InputStream openImageStream(String name) throws IOException {

    Path path = new Path(name);

    final FileSystem fs = HadoopFileUtils.getFileSystem(path);

    if (fs.exists(path)) {
        final InputStream stream = fs.open(path, 131072); // give open a 128K buffer

        Configuration localConf = HadoopUtils.createConfiguration();
        // see if the file is compressed
        final CompressionCodecFactory factory = new CompressionCodecFactory(localConf);
        final CompressionCodec codec = factory.getCodec(path);

        if (codec != null) {
            return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream));
        }

        return stream;
    }

    throw new FileNotFoundException("File not found: " + path.toUri().toString());
}

From source file: org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java

License: Open Source License

public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {

    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.utils.TarballReader.java

License: Apache License

@Override
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        pos = 0;
        end = Long.MAX_VALUE;
        key = new TarballEntry();
        value = new Text();

        FileSplit split = (FileSplit) isplit;
        Path file = split.getPath();
        tarball = file.getName();

        Configuration conf = context.getConfiguration();
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(file);

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream fileIn = fs.open(split.getPath());

        // NOTE: getCodec returns null for unrecognized suffixes; this reader assumes
        // a compressed tarball (e.g. .tar.gz), otherwise the next line throws a NullPointerException
        in = new TarInputStream(codec.createInputStream(fileIn));
    } catch (IOException ex) {
        Logger.getLogger(TarballReader.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file: pl.edu.icm.coansys.commons.pig.udf.RichSequenceFileLoader.java

License: Open Source License

/**
 * @param path
 * @param job
 */
private void setCompression(Path path, Job job) {
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(job.getConfiguration());
    CompressionCodec codec = codecFactory.getCodec(path);
    if (codec != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, codec.getClass());
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}