Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputCompressorClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputCompressorClass.

Prototype

public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)

Source Link

Document

Set the CompressionCodec to be used to compress job outputs.

Usage

From source file:cn.lhfei.hadoop.ch04.MaxTemperatureWithCompression.java

License:Apache License

public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperatureWithCompression <input path> " + "<output path>");
        System.exit(-1);/*from  w w  w  .ja  v  a2  s.  co  m*/
    }

    try {
        Job job = new Job();
        job.setJarByClass(MaxTemperatureWithCompression.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }

}

From source file:com.cloudera.oryx.computation.common.JobStep.java

License:Open Source License

/**
 * Creates a new {@link MRPipeline} instance that contains common configuration
 * settings./*w ww.  j  a va 2  s.com*/
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());

    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");

    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);

    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);

    Config appConfig = ConfigUtils.getDefaultConfig();

    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));

    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
                conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
            "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);

    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);

    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
                conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
            "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);

    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);

    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());

    // Make sure to set any args to conf above this line!

    setConf(conf);

    Job job = Job.getInstance(conf);

    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    log.info("Created pipeline configuration {}", job.getConfiguration());

    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 *
 * @param inputPaths/*from   w w  w. j a v a 2s . co  m*/
 * @param outputPath
 * @param inputFormat
 * @param inputKey
 * @param inputValue
 * @param mapper
 * @param mapperKey
 * @param mapperValue
 * @param combiner
 * @param reducer
 * @param outputKey
 * @param outputValue
 * @param outputFormat
 * @param conf
 * @param overwrite
 * @param isCompress
 * @return
 * @throws IOException
 */
public static Job prepareAvroJob(String inputPaths, String outputPath, Class<? extends InputFormat> inputFormat,
        Object inputKey, Object inputValue, Class<? extends Mapper> mapper, Object mapperKey,
        Object mapperValue, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Object outputKey, Object outputValue, Class<? extends OutputFormat> outputFormat, Configuration conf,
        boolean overwrite, boolean isCompress) throws IOException {

    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();
    if (inputKey instanceof Schema) {

        if (inputValue instanceof Schema) {
            inputFormat = inputFormat == null ? AvroKeyValueInputFormat.class : inputFormat;
        }
        inputFormat = inputFormat == null ? AvroKeyInputFormat.class : inputFormat;

    }
    if (inputFormat != null) {
        job.setInputFormatClass(inputFormat);
    }

    if (inputKey instanceof Schema) {
        AvroJob.setInputKeySchema(job, (Schema) inputKey);
    }

    if (inputValue instanceof Schema) {
        AvroJob.setInputValueSchema(job, (Schema) inputValue);
    }

    if (outputKey instanceof Schema) {

        if (outputValue instanceof Schema) {
            outputFormat = outputFormat == null ? AvroKeyValueOutputFormat.class : outputFormat;
        }
        outputFormat = outputFormat == null ? AvroKeyOutputFormat.class : outputFormat;

    }
    if (outputFormat != null) {
        job.setOutputFormatClass(outputFormat);
    }

    if (outputKey instanceof Schema) {
        AvroJob.setOutputKeySchema(job, (Schema) outputKey);
    } else if (outputKey instanceof Class) {
        job.setOutputKeyClass((Class) outputKey);
    }

    if (outputValue instanceof Schema) {
        AvroJob.setOutputValueSchema(job, (Schema) outputValue);
    } else if (outputValue instanceof Class) {
        job.setOutputValueClass((Class) outputValue);
    }

    if (reducer == null) {
        job.setNumReduceTasks(0);

        if (mapperKey instanceof Schema) {
            AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
        } else if (mapperKey instanceof Class) {
            job.setOutputKeyClass((Class) mapperKey);
        }

        if (mapperValue instanceof Schema) {
            AvroJob.setOutputValueSchema(job, (Schema) mapperValue);
        } else if (mapperKey instanceof Class) {
            job.setOutputValueClass((Class) mapperValue);
        }
        job.setJarByClass(mapper);

    } else if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);

    } else {
        job.setJarByClass(reducer);

    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (isCompress) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
    }

    job.setMapperClass(mapper);
    if (mapperKey instanceof Schema) {
        AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
    } else if (mapperKey instanceof Class) {
        job.setMapOutputKeyClass((Class) mapperKey);
    }

    if (mapperValue instanceof Schema) {
        AvroJob.setMapOutputValueSchema(job, (Schema) mapperValue);
    } else if (mapperKey instanceof Class) {
        job.setMapOutputValueClass((Class) mapperValue);
    }

    if (reducer != null) {
        job.setReducerClass(reducer);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    if (overwrite) {
        HadoopUtils.delete(jobConf, new Path(outputPath));
    }

    return job;

}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

public static Job prepareAvroJob(String inputPaths, Path outputPath, Schema inputKeySchema,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> combiner,
        Class<? extends Reducer> reducer, Schema outputKeySchema, Class<? extends Writable> outputValue,
        Configuration conf, boolean overwrite) throws IOException {
    Job job = Job.getInstance(conf);//from w  w  w .j a v  a 2s  . c  o m
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, inputKeySchema);
    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setReducerClass(reducer);
    AvroJob.setOutputKeySchema(job, outputKeySchema);
    job.setOutputValueClass(outputValue);

    if (overwrite) {
        HadoopUtils.delete(jobConf, outputPath);
    }

    return job;

}

From source file:com.hadoop.mapreduce.TestLzoLazyLoading.java

License:Open Source License

private void runWordCount(Configuration cf, boolean compressIn, boolean compressOut)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration thisConf = new Configuration(cf);
    if (compressIn) {
        thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-map", true);
    }// w  ww .j  a  v a  2s .c  o  m

    if (compressOut) {
        thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-reduce", true);
    }
    Path pathIn = new Path(TEST_ROOT_DIR + "/in");
    Path pathOut = new Path(TEST_ROOT_DIR + "/out");
    localFs.delete(pathIn, true);
    localFs.delete(pathOut, true);
    writeFile(makeFileName("in/part1", compressIn), "this is a test\nof word count test\ntest\n");
    writeFile(makeFileName("in/part2", compressIn), "more test");
    Job job = new Job(thisConf, "word count");
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    if (compressOut) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, LzoCodec.class);
    }
    FileInputFormat.addInputPath(job, pathIn);
    FileOutputFormat.setOutputPath(job, pathOut);
    job.submit();
    assertEquals("IsLzoChecked (client)?", compressIn, LzoCodec.isNativeLzoChecked());
    assertTrue(job.waitForCompletion(false));
    String result = readFile(makeFileName("out/part-r-00000", compressOut));
    System.out.println(result);
    assertEquals("a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n", result);
}

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.AverageJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, AverageJob.class.getSimpleName());
    job.setJarByClass(AverageJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample Average Job");
    job.setMapperClass(AverageMapper.class);
    job.setCombinerClass(AverageCombiner.class);
    job.setReducerClass(AverageReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;/*from   w w  w .  j a va2 s  .c o m*/
}

From source file:com.knewton.mapreduce.example.SSTableMRExample.java

License:Apache License

public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException, ParseException {

    long startTime = System.currentTimeMillis();
    Options options = buildOptions();/*from   ww w  . ja  v  a 2 s.c om*/

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2 || cli.hasOption('h')) {
        printUsage(options);
    }
    Job job = getJobConf(cli);

    job.setJarByClass(SSTableMRExample.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(StudentEventWritable.class);

    job.setMapperClass(StudentEventMapper.class);
    job.setReducerClass(StudentEventReducer.class);

    job.setInputFormatClass(SSTableColumnInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // input arg
    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);
    // output arg
    FileOutputFormat.setOutputPath(job, new Path(cli.getArgs()[1]));
    if (cli.hasOption('c')) {
        LOG.info("Using compression for output.");
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        FileOutputFormat.setCompressOutput(job, true);
    }
    job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
}

From source file:com.metamx.milano.pig.MilanoStoreFunc.java

License:Apache License

/**
 * This does the setup for the mapper/reducer side.
 *
 * @param location The output path./*from ww  w.j ava  2  s  .co  m*/
 * @param job      The job config.
 *
 * @throws IOException Currently not thrown, but is part of the overridden signature.
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    FileOutputFormat.setOutputPath(job, new Path(location));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Properties props = getUDFProps();

    job.getConfiguration().set("com.metamx.milano.proto.descriptor.base64",
            (String) props.get("milano.pig.proto.schema.base64"));
}

From source file:com.mozilla.main.ReadHBaseWriteHdfs.java

License:LGPL

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.job.queue.name", "prod");
    Job job = new Job(conf, "ReadHBaseWriteHDFS");
    job.setJarByClass(ReadHBaseWriteHdfs.class);
    Scan scan = new Scan();
    scan.addFamily("data".getBytes());

    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, ReadHBaseWriteHdfsMapper.class, Text.class,
            Text.class, job);

    job.setReducerClass(ReadHBaseWriteHdfsReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1000);//from   ww  w . j  ava  2s. c o  m

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[0]));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        System.out.println("DONE");
    }

    return 0;
}

From source file:com.mozilla.pig.storage.SeqFileMultiStorage.java

License:Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.setOutputKeyClass(this.keyClass);
    job.setOutputKeyClass(this.keyClass);
    Configuration conf = job.getConfiguration();
    if ("true".equals(conf.get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = conf.get("output.compression.codec");
        FileOutputFormat.setOutputCompressorClass(job,
                PigContext.resolveClassName(codec).asSubclass(CompressionCodec.class));
    }//from w w  w  .j  a v  a 2 s  .c o m
    FileOutputFormat.setOutputPath(job, new Path(location));
}