List of usage examples for org.apache.hadoop.io.compress.CompressionCodec.getDefaultExtension()
String getDefaultExtension();
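getDefaultExtension() returns the filename suffix conventionally attached to files written with the codec, leading dot included: ".gz" for GzipCodec, ".deflate" for DefaultCodec, ".bz2" for BZip2Codec. The examples below either append the extension when writing compressed output or strip it (via CompressionCodecFactory.removeSuffix) when reading compressed input. Here is a minimal sketch of the call, assuming only the Hadoop client libraries on the classpath; the class name DefaultExtensionDemo is ours for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec through ReflectionUtils so it picks up the Configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // The extension includes the leading dot, so it can be appended to a base name directly.
        System.out.println("part-00000" + codec.getDefaultExtension()); // prints part-00000.gz
    }
}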
From source file:com.datascience.hadoop.CsvOutputFormat.java
License:Apache License
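Appends the configured codec's default extension to the task output path when compressed output is enabled: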
@Override
public RecordWriter<LongWritable, ListWritable<Text>> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable progress) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    Path path;
    if (FileOutputFormat.getCompressOutput(conf)) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(conf,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        path = FileOutputFormat.getTaskOutputPath(conf, name + codec.getDefaultExtension());
    } else {
        path = FileOutputFormat.getTaskOutputPath(conf, name);
    }

    return new CsvRecordWriter(new OutputStreamWriter(path.getFileSystem(conf).create(path, progress), charset),
            createFormat(conf));
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
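Looks up the codec matching a path's suffix and, despite the method's name, returns the codec's default extension, or null when no codec matches: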
public static String getCodecNameFromPath(Configuration conf, String path) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return null;
    }
    return codec.getDefaultExtension();
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
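Resolves a codec by name and wraps an output stream for compression, printing the codec's default extension along the way: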
public static OutputStream getCodecOutputStream(Configuration conf, String codecName, OutputStream out)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    String codecClassName = codecName;
    CodecType codecType = CodecType.getCodecType(codecName);
    if (codecType != null) {
        codecClassName = codecType.getIOCompressionCodecs();
    }
    System.out.println("codec class : " + codecClassName);
    CompressionCodec codec = compressionCodecs.getCodecByName(codecClassName);
    if (codec == null) {
        return out;
    }
    System.out.println("Getting OutputStream : " + codec.getDefaultExtension());
    System.out.println("Getting OutputStream : " + codec);
    Compressor compressor = codec.createCompressor();
    return codec.createOutputStream(out, compressor);
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
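Wraps an input stream for decompression when the path's suffix matches a registered codec, passing the stream through unchanged otherwise: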
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return in;
    }
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    Decompressor decompressor = codec.createDecompressor();
    in = codec.createInputStream(in, decompressor);
    return in;
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
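Parses the Crush job's command-line options; when output compression is enabled, it instantiates the chosen codec and records its default extension: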
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));
        try {
            String line;
            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }
        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use,
         * which is an integral number, just like the third argument in the new version, which is a timestamp. We tell
         * the two apart by looking at the value of the argument. A timestamp is going to be a huge, 14-digit number
         * while the number of tasks should be much smaller.
         */
        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {
            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapred.reduce.tasks", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
        job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
        codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codec = null;
    } else if ("gzip".equals(codec)) {
        codec = GzipCodec.class.getName();
    } else {
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codec) {
        job.setBoolean("mapred.output.compress", false);
    } else {
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.type", "BLOCK");
        job.set("mapred.output.compression.codec", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
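A test helper that locates compressed crush output by appending the codec's default extension to the expected file mask before globbing: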
/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt, CompressionCodec codec,
        String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *             \ out format
     *   in format \   seq    | text
     *   ------------------------------
     *         seq |  Custom  | ascii
     *   ------------------------------
     *        text |  Text    | ascii
     *   ------------------------------
     */
    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually
         * what we want. We want to preserve the actual keys and values in the files, just like
         * SequenceFileInputFormat. So, either way, the keys and values should be the text representations of what
         * went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writable.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {

        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));
        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}
From source file:com.hp.hpit.cs.MyTextOutputFormat.java
License:Apache License
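A TextOutputFormat variant that derives the work file's extension from the configured output compressor: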
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}
From source file:com.inmobi.conduit.CompressedFileReaderTest.java
License:Apache License
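Uncompresses a local file, using removeSuffix() with the codec's default extension to derive the output name: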
private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs;
    fs = FileSystem.getLocal(conf);
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);
    }
}
From source file:com.jeffy.hdfs.compression.FileCompressor.java
License:Apache License
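Compresses each input file with a pooled Compressor, building each output filename from the input name plus the codec's default extension: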
/**
 * Compresses each input file with the codec named by the first argument, writing
 * the result alongside the input with the codec's default extension appended.
 *
 * @param args args[0] is the codec name or alias (e.g. "gzip"); the remaining
 *             arguments are the files to compress
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    // For example for the 'GzipCodec' codec class name the alias are 'gzip' and 'gzipcodec'.
    CompressionCodec codec = factory.getCodecByName(args[0]);
    if (codec == null) {
        System.err.println("Compression codec not found for " + args[0]);
        System.exit(1);
    }
    String ext = codec.getDefaultExtension();
    Compressor compressor = null;
    try {
        // Obtain a Compressor from the CodecPool so it can be reused across files.
        compressor = CodecPool.getCompressor(codec);
        for (int i = 1; i < args.length; i++) {
            String filename = args[i] + ext;
            System.out.println("Compressing the file " + filename);
            try (FileSystem outFs = FileSystem.get(URI.create(filename), conf);
                    FileSystem inFs = FileSystem.get(URI.create(args[i]), conf);
                    InputStream in = inFs.open(new Path(args[i]))) {
                // Create a compressing output stream that uses the pooled Compressor.
                CompressionOutputStream out = codec.createOutputStream(outFs.create(new Path(filename)),
                        compressor);
                IOUtils.copy(in, out);
                // finish() flushes the compressed data without closing the underlying stream.
                out.finish();
                // Reset the compressor between files; reusing it without a reset throws
                // java.io.IOException: write beyond end of stream.
                compressor.reset();
            }
        }
    } finally {
        // Return the Compressor to the pool.
        CodecPool.returnCompressor(compressor);
    }
}
From source file:com.jeffy.hdfs.compression.FileDecompressor.java
License:Apache License
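Decompresses each input URI, stripping the codec's default extension with removeSuffix() to name the output: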
/**
 * Decompresses each file URI given on the command line, inferring the codec from
 * the file extension and stripping that extension to name the output.
 *
 * @param args the URIs of the compressed files
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file suffix; codecs are registered via io.compression.codecs.
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
        // Strip the codec's default extension to build the output file name.
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}