Example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

List of usage examples for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension.

Prototype

String getDefaultExtension();

Source Link

Document

Get the default filename extension for this kind of compression.
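
Before the usage listings below, here is a minimal, self-contained sketch of the common pattern: instantiate a codec via ReflectionUtils and append getDefaultExtension() to an output file name. The use of GzipCodec and the output path shown here are illustrative assumptions, not taken from any of the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Instantiate a codec; GzipCodec is used here purely as an illustration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // getDefaultExtension() returns the conventional suffix for the codec
        // (".gz" for GzipCodec), which is typically appended to output file names.
        String extension = codec.getDefaultExtension();
        Path outputFile = new Path("/tmp/example-output" + extension);

        System.out.println("Codec extension: " + extension);
        System.out.println("Output path:     " + outputFile);
    }
}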

Usage

From source file:com.linkedin.cubert.io.rubix.RubixOutputFormat.java

License:Open Source License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String extension = RubixConstants.RUBIX_EXTENSION;

    CompressionCodec codec = null;
    boolean isCompressed = getCompressOutput(context);

    if (isCompressed) {
        Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension += codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, extension);
    FileSystem fs = file.getFileSystem(conf);

    FSDataOutputStream fileOut = fs.create(file, false);
    return new RubixRecordWriter<K, V>(conf, fileOut, context.getOutputKeyClass(),
            context.getOutputValueClass(), codec);
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

        try {
            String line;

            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }

        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFilesMatcher = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    if (cli.hasOption("skip-regex")) {
        skippedFilesMatcher = Pattern.compile(cli.getOptionValue("skip-regex")).matcher("");
    }

    if (cli.hasOption("max-tasks")) {
        maxTasks = Integer.parseInt(cli.getOptionValue("max-tasks"));
    }

    if (cli.hasOption("job-name")) {
        job.set("mapreduce.job.name", cli.getOptionValue("job-name"));
    }

    removeEmptyFiles = cli.hasOption("remove-empty-files");

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);

        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
         * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
         * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
         * smaller.
         */

        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {

            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapreduce.job.reduces", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
        print(Verbosity.INFO, format("\nSmall file threshold: "
                + NumberFormat.getNumberInstance(Locale.US).format(maxEligibleSize) + " bytes\n"));
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFilesMatcher != null) {
        job.set("crush.ignore-regex", ignoredFilesMatcher.pattern().pattern());
    }

    if (skippedFilesMatcher != null) {
        job.set("crush.skip-regex", skippedFilesMatcher.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else if ("avro".equals(inFmt)) {
            inFmt = AvroContainerInputFormat.class.getName();
        } else if ("parquet".equals(inFmt)) {
            inFmt = MapredParquetInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else if ("avro".equals(outFmt)) {
            outFmt = AvroContainerOutputFormat.class.getName();
        } else if ("parquet".equals(outFmt)) {
            outFmt = MapredParquetOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");
    String codecClassName = null;

    if (null == codec || "deflate".equals(codec)) {
        codecClassName = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codecClassName = null;
    } else if ("gzip".equals(codec)) {
        codecClassName = GzipCodec.class.getName();
    } else if ("snappy".equals(codec)) {
        codecClassName = SnappyCodec.class.getName();
    } else if ("bzip2".equals(codec)) {
        codecClassName = BZip2Codec.class.getName();
    } else {
        codecClassName = codec;
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codecClassName) {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", false);
        job.set("avro.output.codec", "null");
        job.set("parquet.compression", "uncompressed");
    } else {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
        job.set("mapreduce.output.fileoutputformat.compress.codec", codecClassName);
        job.set("avro.output.codec", codec);
        job.set("parquet.compression", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codecClassName).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}

From source file:com.m6d.filecrush.crush.TextOutputFormat.java

License:Apache License

public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "");
    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.metamx.druid.indexer.Utils.java

License:Open Source License

public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath, boolean deleteExisting)
        throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());

    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }

    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());

        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }

    return retVal;
}

From source file:com.panguso.lc.analysis.format.mapreduce.TextOutputFormat.java

License:Open Source License

/**
 * @param job job
 * @throws IOException IOException
 * @throws InterruptedException InterruptedException
 */
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", separate);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.pinterest.secor.parser.PartitionFinalizer.java

License:Apache License

public PartitionFinalizer(SecorConfig config) throws Exception {
    mConfig = config;
    mKafkaClient = new KafkaClient(mConfig);
    mZookeeperConnector = new ZookeeperConnector(mConfig);
    mThriftMessageParser = new ThriftMessageParser(mConfig);
    mQuboleClient = new QuboleClient(mConfig);
    if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
        CompressionCodec codec = (CompressionCodec) ReflectionUtil
                .createCompressionCodec(mConfig.getCompressionCodec());
        mFileExtension = codec.getDefaultExtension();
    } else {
        mFileExtension = "";
    }
}

From source file:com.pinterest.secor.uploader.Uploader.java

License:Apache License

private void trim(LogFilePath srcPath, long startOffset) throws Exception {
    if (startOffset == srcPath.getOffset()) {
        return;
    }
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    String srcFilename = srcPath.getLogFilePath();
    Path srcFsPath = new Path(srcFilename);
    SequenceFile.Reader reader = null;
    SequenceFile.Writer writer = null;
    LogFilePath dstPath = null;
    int copiedMessages = 0;
    // Deleting the writer closes its stream flushing all pending data to the disk.
    mFileRegistry.deleteWriter(srcPath);
    try {
        reader = createReader(fs, srcFsPath, config);
        LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
        BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
        CompressionCodec codec = null;
        String extension = "";
        if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
            codec = (CompressionCodec) ReflectionUtil.createCompressionCodec(mConfig.getCompressionCodec());
            extension = codec.getDefaultExtension();
        }
        while (reader.next(key, value)) {
            if (key.get() >= startOffset) {
                if (writer == null) {
                    String localPrefix = mConfig.getLocalPath() + '/' + IdUtil.getLocalMessageDir();
                    dstPath = new LogFilePath(localPrefix, srcPath.getTopic(), srcPath.getPartitions(),
                            srcPath.getGeneration(), srcPath.getKafkaPartition(), startOffset, extension);
                    writer = mFileRegistry.getOrCreateWriter(dstPath, codec);
                }
                writer.append(key, value);
                copiedMessages++;
            }
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    mFileRegistry.deletePath(srcPath);
    if (dstPath == null) {
        LOG.info("removed file " + srcPath.getLogFilePath());
    } else {
        LOG.info("trimmed " + copiedMessages + " messages from " + srcFilename + " to "
                + dstPath.getLogFilePath() + " with start offset " + startOffset);
    }
}

From source file:com.ricemap.spateDB.core.GridRecordWriter.java

License:Apache License

/**
 * Returns path to a file in which the final cell will be written.
 * @param cellIndex index of the cell to write
 * @return
 * @throws IOException 
 */
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path path = null;
    do {
        String filename = counter == 0 ? String.format("data_%05d", cellIndex)
                : String.format("data_%05d_%d", cellIndex, counter);
        boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf);
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf,
                    GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
            filename += codec.getDefaultExtension();
        }

        path = getFilePath(filename);
        counter++;
    } while (fileSystem.exists(path));
    return path;
}

From source file:com.ricemap.spateDB.mapred.TextOutputFormat.java

License:Apache License

public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapred.textoutputformat.separator", "\t");
    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License:Apache License

private void testPath(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${YY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = (compressionCodec == null)
            ? SequenceFile.CompressionType.NONE
            : SequenceFile.CompressionType.BLOCK;
    String keyEL = "uuid()";
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    if (compressionCodec == null) {
        Assert.assertEquals("", mgr.getExtension());
    } else {
        Assert.assertEquals(compressionCodec.getDefaultExtension(), mgr.getExtension());
    }

    Date date = getFixedDate();
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    Assert.assertTrue(mgr.getPath(date, record).toString()
            .startsWith(new Path(getTestDir(), "2015/15/01/20/09/56/01/a/_tmp_" + prefix).toString()));
}