Example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

List of usage examples for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension.

Prototype

String getDefaultExtension();

Source Link

Document

Get the default filename extension for this kind of compression.
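
Before the usage listings below, here is a minimal, self-contained sketch of the common pattern: instantiate a codec via ReflectionUtils and append getDefaultExtension() to an output file name. The use of GzipCodec and the output path shown here are illustrative assumptions, not taken from any of the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Instantiate a codec; GzipCodec is used here purely as an illustration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // getDefaultExtension() returns the conventional suffix for the codec
        // (".gz" for GzipCodec), which is typically appended to output file names.
        String extension = codec.getDefaultExtension();
        Path outputFile = new Path("/tmp/example-output" + extension);

        System.out.println("Codec extension: " + extension);
        System.out.println("Output path:     " + outputFile);
    }
}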

Usage

From source file:com.linkedin.cubert.io.rubix.RubixOutputFormat.java

License:Open Source License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String extension = RubixConstants.RUBIX_EXTENSION;

    CompressionCodec codec = null;
    boolean isCompressed = getCompressOutput(context);

    if (isCompressed) {
        Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension += codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, extension);
    FileSystem fs = file.getFileSystem(conf);

    FSDataOutputStream fileOut = fs.create(file, false);
    return new RubixRecordWriter<K, V>(conf, fileOut, context.getOutputKeyClass(),
            context.getOutputValueClass(), codec);
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

        try {
            String line;

            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }

        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFilesMatcher = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    if (cli.hasOption("skip-regex")) {
        skippedFilesMatcher = Pattern.compile(cli.getOptionValue("skip-regex")).matcher("");
    }

    if (cli.hasOption("max-tasks")) {
        maxTasks = Integer.parseInt(cli.getOptionValue("max-tasks"));
    }

    if (cli.hasOption("job-name")) {
        job.set("mapreduce.job.name", cli.getOptionValue("job-name"));
    }

    removeEmptyFiles = cli.hasOption("remove-empty-files");

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);

        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
         * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
         * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
         * smaller.
         */

        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {

            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapreduce.job.reduces", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
        print(Verbosity.INFO, format("\nSmall file threshold: "
                + NumberFormat.getNumberInstance(Locale.US).format(maxEligibleSize) + " bytes\n"));
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFilesMatcher != null) {
        job.set("crush.ignore-regex", ignoredFilesMatcher.pattern().pattern());
    }

    if (skippedFilesMatcher != null) {
        job.set("crush.skip-regex", skippedFilesMatcher.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else if ("avro".equals(inFmt)) {
            inFmt = AvroContainerInputFormat.class.getName();
        } else if ("parquet".equals(inFmt)) {
            inFmt = MapredParquetInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else if ("avro".equals(outFmt)) {
            outFmt = AvroContainerOutputFormat.class.getName();
        } else if ("parquet".equals(outFmt)) {
            outFmt = MapredParquetOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");
    String codecClassName = null;

    if (null == codec || "deflate".equals(codec)) {
        codecClassName = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codecClassName = null;
    } else if ("gzip".equals(codec)) {
        codecClassName = GzipCodec.class.getName();
    } else if ("snappy".equals(codec)) {
        codecClassName = SnappyCodec.class.getName();
    } else if ("bzip2".equals(codec)) {
        codecClassName = BZip2Codec.class.getName();
    } else {
        codecClassName = codec;
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codecClassName) {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", false);
        job.set("avro.output.codec", "null");
        job.set("parquet.compression", "uncompressed");
    } else {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
        job.set("mapreduce.output.fileoutputformat.compress.codec", codecClassName);
        job.set("avro.output.codec", codec);
        job.set("parquet.compression", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codecClassName).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}

From source file:com.m6d.filecrush.crush.TextOutputFormat.java

License:Apache License

public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "");
    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.metamx.druid.indexer.Utils.java

License:Open Source License

public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath, boolean deleteExisting)
        throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());

    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }

    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());

        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }

    return retVal;
}

From source file:com.panguso.lc.analysis.format.mapreduce.TextOutputFormat.java

License:Open Source License

/**
 * @param job job
 * @throws IOException IOException
 * @throws InterruptedException InterruptedException
 */
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", separate);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.pinterest.secor.parser.PartitionFinalizer.java

License:Apache License

public PartitionFinalizer(SecorConfig config) throws Exception {
    mConfig = config;
    mKafkaClient = new KafkaClient(mConfig);
    mZookeeperConnector = new ZookeeperConnector(mConfig);
    mThriftMessageParser = new ThriftMessageParser(mConfig);
    mQuboleClient = new QuboleClient(mConfig);
    if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
        CompressionCodec codec = (CompressionCodec) ReflectionUtil
                .createCompressionCodec(mConfig.getCompressionCodec());
        mFileExtension = codec.getDefaultExtension();
    } else {
        mFileExtension = "";
    }
}

From source file:com.pinterest.secor.uploader.Uploader.java

License:Apache License

private void trim(LogFilePath srcPath, long startOffset) throws Exception {
    if (startOffset == srcPath.getOffset()) {
        return;
    }
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    String srcFilename = srcPath.getLogFilePath();
    Path srcFsPath = new Path(srcFilename);
    SequenceFile.Reader reader = null;
    SequenceFile.Writer writer = null;
    LogFilePath dstPath = null;
    int copiedMessages = 0;
    // Deleting the writer closes its stream flushing all pending data to the disk.
    mFileRegistry.deleteWriter(srcPath);
    try {
        reader = createReader(fs, srcFsPath, config);
        LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
        BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
        CompressionCodec codec = null;
        String extension = "";
        if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
            codec = (CompressionCodec) ReflectionUtil.createCompressionCodec(mConfig.getCompressionCodec());
            extension = codec.getDefaultExtension();
        }
        while (reader.next(key, value)) {
            if (key.get() >= startOffset) {
                if (writer == null) {
                    String localPrefix = mConfig.getLocalPath() + '/' + IdUtil.getLocalMessageDir();
                    dstPath = new LogFilePath(localPrefix, srcPath.getTopic(), srcPath.getPartitions(),
                            srcPath.getGeneration(), srcPath.getKafkaPartition(), startOffset, extension);
                    writer = mFileRegistry.getOrCreateWriter(dstPath, codec);
                }
                writer.append(key, value);
                copiedMessages++;
            }
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    mFileRegistry.deletePath(srcPath);
    if (dstPath == null) {
        LOG.info("removed file " + srcPath.getLogFilePath());
    } else {
        LOG.info("trimmed " + copiedMessages + " messages from " + srcFilename + " to "
                + dstPath.getLogFilePath() + " with start offset " + startOffset);
    }
}

From source file:com.ricemap.spateDB.core.GridRecordWriter.java

License:Apache License

/**
 * Returns path to a file in which the final cell will be written.
 * @param cellIndex index of the cell to write
 * @return
 * @throws IOException 
 */
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path path = null;
    do {
        String filename = counter == 0 ? String.format("data_%05d", cellIndex)
                : String.format("data_%05d_%d", cellIndex, counter);
        boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf);
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf,
                    GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
            filename += codec.getDefaultExtension();
        }

        path = getFilePath(filename);
        counter++;
    } while (fileSystem.exists(path));
    return path;
}

From source file:com.ricemap.spateDB.mapred.TextOutputFormat.java

License:Apache License

public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapred.textoutputformat.separator", "\t");
    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License:Apache License

private void testPath(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${YY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = (compressionCodec == null)
            ? SequenceFile.CompressionType.NONE
            : SequenceFile.CompressionType.BLOCK;
    String keyEL = "uuid()";
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    if (compressionCodec == null) {
        Assert.assertEquals("", mgr.getExtension());
    } else {
        Assert.assertEquals(compressionCodec.getDefaultExtension(), mgr.getExtension());
    }

    Date date = getFixedDate();
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    Assert.assertTrue(mgr.getPath(date, record).toString()
            .startsWith(new Path(getTestDir(), "2015/15/01/20/09/56/01/a/_tmp_" + prefix).toString()));
}