Example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

Introduction

On this page you can find example usage of org.apache.hadoop.io.compress.CompressionCodec.getDefaultExtension().

Prototype

String getDefaultExtension();

Document

Get the default filename extension for this kind of compression.
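
Before the per-project examples below, here is a minimal, self-contained sketch of the method in isolation (the class name DefaultExtensionDemo is ours, not from any of the sources): instantiate a codec the way Hadoop's output formats do and print its extension. Note that the returned extension includes the leading dot, e.g. ".gz" for GzipCodec.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec reflectively, as Hadoop's output formats do.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // Prints ".gz"; the returned extension includes the leading dot.
        System.out.println(codec.getDefaultExtension());
    }
}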

Usage

From source file:com.datascience.hadoop.CsvOutputFormat.java

License:Apache License

@Override
public RecordWriter<LongWritable, ListWritable<Text>> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable progress) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    Path path;
    if (FileOutputFormat.getCompressOutput(conf)) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(conf,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
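        // Append the codec's default extension (e.g. ".gz") so the file name reflects its compression.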
        path = FileOutputFormat.getTaskOutputPath(conf, name + codec.getDefaultExtension());
    } else {
        path = FileOutputFormat.getTaskOutputPath(conf, name);
    }
    return new CsvRecordWriter(new OutputStreamWriter(path.getFileSystem(conf).create(path, progress), charset),
            createFormat(conf));
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java

License:Apache License

public static String getCodecNameFromPath(Configuration conf, String path) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
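    // If the factory recognizes the path's extension, report the codec's canonical (default) extension.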
    if (codec == null)
        return null;
    else
        return codec.getDefaultExtension();
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java

License:Apache License

public static OutputStream getCodecOutputStream(Configuration conf, String codecName, OutputStream out)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    String codecClassName = codecName;
    CodecType codecType = CodecType.getCodecType(codecName);
    if (codecType != null) {
        codecClassName = codecType.getIOCompressionCodecs();
    }
    System.out.println("codec class : " + codecClassName);
    CompressionCodec codec = compressionCodecs.getCodecByName(codecClassName);

    if (codec == null) {
        return out;
    }

    System.out.println("Getting OutputStream : " + codec.getDefaultExtension());
    System.out.println("Getting OutputStream : " + codec);
    Compressor compressor = codec.createCompressor();
    return codec.createOutputStream(out, compressor);
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java

License:Apache License

public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {

    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null)
        return in;
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    Decompressor compressor = codec.createDecompressor();
    in = codec.createInputStream(in, compressor);

    return in;
}

From source file:com.hdfs.concat.crush.Crush.java

License:Apache License

boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

        try {
            String line;

            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }

        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);

        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
         * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
         * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
         * smaller.
         */

        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {

            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapred.reduce.tasks", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
        job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
        codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codec = null;
    } else if ("gzip".equals(codec)) {
        codec = GzipCodec.class.getName();
    } else {
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codec) {
        job.setBoolean("mapred.output.compress", false);
    } else {
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.type", "BLOCK");
        job.set("mapred.output.compression.codec", codec);

        try {
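            // Instantiate the codec only to capture its default extension for naming the crushed output files.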
            CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}

From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java

License:Apache License

/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt, CompressionCodec codec,
        String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *             \  out format
     *  in format   \  seq    | text
     * -----------------------------
     *       seq    | Custom | ascii
     *       text   | Text   | ascii
     * -----------------------------
     */

    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually what we want.
         * We want to preserve the actual keys and values in the files, just like SequenceFileInputFormat. So, either way, the
         * keys and values should be the text representations of what went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);

            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();

            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
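            // Compressed output carries the codec's default extension, so append it to the glob mask.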
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());

            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();

            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writable.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {

        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}

From source file:com.hp.hpit.cs.MyTextOutputFormat.java

License:Apache License

public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
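    // getDefaultWorkFile() appends the extension (empty when uncompressed) to the task's work file name.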
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.inmobi.conduit.CompressedFileReaderTest.java

License:Apache License

private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
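        // removeSuffix() strips the codec's default extension to recover the base file name.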
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);

    }
}

From source file:com.jeffy.hdfs.compression.FileCompressor.java

License:Apache License

/**
 * Compress each input file with the codec named on the command line.
 *
 * @param args args[0] is the codec name or alias (e.g. "gzip"); the remaining
 *             arguments are the paths of the files to compress.
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Get the factory for all configured compression codecs.
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    // For example, for the GzipCodec class the aliases are 'gzip' and 'gzipcodec'.
    CompressionCodec codec = factory.getCodecByName(args[0]);
    if (codec == null) { // No codec is registered under this name or alias.
        System.err.println("Compression codec not found for " + args[0]);
        System.exit(1);
    }
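    // Output files are named by appending the codec's default extension to the input name.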
    String ext = codec.getDefaultExtension();
    Compressor compressor = null;
    try {
        // Borrow a reusable Compressor from the CodecPool.
        compressor = CodecPool.getCompressor(codec);
        for (int i = 1; i < args.length; i++) {
            String filename = args[i] + ext;
            System.out.println("Compression the file " + filename);
            try (FileSystem outFs = FileSystem.get(URI.create(filename), conf);
                    FileSystem inFs = FileSystem.get(URI.create(args[i]), conf);
                    InputStream in = inFs.open(new Path(args[i]))) {
                // Create the compressed output stream using the pooled Compressor.
                CompressionOutputStream out = codec.createOutputStream(outFs.create(new Path(filename)),
                        compressor);
                // Copy the raw bytes through the compressing stream.
                IOUtils.copy(in, out);
                out.finish(); // finish() flushes the remaining compressed data without closing the underlying stream.
                compressor.reset(); // Reset before reuse, otherwise the next file fails with java.io.IOException: write beyond end of stream.
            }
        }
    } finally { // Always return the Compressor to the pool.
        CodecPool.returnCompressor(compressor);
    }
}

From source file:com.jeffy.hdfs.compression.FileDecompressor.java

License:Apache License

/**
 * Decompress each file named on the command line, inferring the codec from
 * the file extension.
 *
 * @param args the paths of the compressed files to decompress
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    // Load the default Hadoop configuration.
    Configuration conf = new Configuration();
    // The factory maps file extensions to registered codecs.
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file extension; codecs are registered via io.compression.codecs.
        CompressionCodec codec = factory.getCodec(inputPath);
        // Skip files whose extension matches no registered codec.
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
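        // Strip the codec's default extension (e.g. "foo.gz" becomes "foo") to name the decompressed output.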
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}