Usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setCompressOutput
public static void setCompressOutput(Job job, boolean compress)
From source file:cn.lhfei.hadoop.ch04.MaxTemperatureWithCompression.java
License:Apache License
/**
 * Command-line entry point: configures and runs the max-temperature job with
 * gzip-compressed output.
 *
 * <p>The process exit status is 0 only when the job reports success. A wrong
 * argument count exits with -1; a failed job or an exception while setting up
 * or running the job exits with 1.
 *
 * @param args exactly two elements: the input path and the output path
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperatureWithCompression <input path> " + "<output path>");
        System.exit(-1);
    }

    // Assume failure until the job has actually reported success, so that an
    // exception can never result in a "successful" exit status.
    int exitCode = 1;
    try {
        Job job = new Job();
        job.setJarByClass(MaxTemperatureWithCompression.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Compress the job output with gzip.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        exitCode = job.waitForCompletion(true) ? 0 : 1;
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag before reporting the failure.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
    System.exit(exitCode);
}
From source file:com.awcoleman.ExampleJobSummaryLogWithOutput.BinRecToAvroRecDriver.java
License:Apache License
public int run(String[] args) throws Exception { String input = null;/* w ww . ja v a2s. co m*/ String output = null; if (args.length < 2) { System.err.printf("Usage: %s <input> <output>\n", this.getClass().getSimpleName()); return -1; } else { input = args[0]; output = args[1]; } Job job = Job.getInstance(getConf(), "BinRecToAvroRecDriver"); Configuration conf = job.getConfiguration(); //Add job log to hold Driver logging (and any summary info about the dataset,job, or counters we want to write) String fapath = createTempFileAppender(job); //get schema Schema outSchema = ReflectData.get().getSchema(com.awcoleman.examples.avro.BinRecForPartitions.class); job.getConfiguration().set("outSchema", outSchema.toString()); //Job conf settings job.setJarByClass(BinRecToAvroRecDriver.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(BinRecInputFormat.class); job.setOutputFormatClass(AvroKeyOutputFormat.class); AvroJob.setOutputKeySchema(job, outSchema); AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setMapOutputValueSchema(job, outSchema); //Job output compression FileOutputFormat.setCompressOutput(job, true); job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC); //Input and Output Paths FileInputFormat.setInputPaths(job, new Path(input)); Path outPath = new Path(output); FileOutputFormat.setOutputPath(job, outPath); outPath.getFileSystem(conf).delete(outPath, true); boolean jobCompletionStatus = job.waitForCompletion(true); //Print Custom Counters before exiting Counters counters = job.getCounters(); for (MYJOB_CNTRS customCounter : MYJOB_CNTRS.values()) { Counter thisCounter = counters.findCounter(customCounter); System.out.println("Custom Counter " + customCounter + "=" + thisCounter.getValue()); } long mycnt1 = job.getCounters() .findCounter("com.awcoleman.TestingGettingContainerLogger.BinRecToAvroRecDriver$MYJOB_CNTRS", "MYCNT1") .getValue(); long mycnt2 = 
job.getCounters() .findCounter("com.awcoleman.TestingGettingContainerLogger.BinRecToAvroRecDriver$MYJOB_CNTRS", "MYCNT2") .getValue(); long mycnt3 = job.getCounters() .findCounter("com.awcoleman.TestingGettingContainerLogger.BinRecToAvroRecDriver$MYJOB_CNTRS", "MYCNT3") .getValue(); long myfakekpi = mycnt1 - mycnt2; String msgMyfakekpi = "The Fake KPI of the Dataset: " + String.format("%,d", myfakekpi); System.out.println(msgMyfakekpi); logger.info(msgMyfakekpi); //Finished, so move job log to HDFS in _log dir, clean copyTempFileAppenderToHDFSOutpath(job, fapath, output); return jobCompletionStatus ? 0 : 1; }
From source file:com.ci.backports.avro.mapreduce.AvroOutputFormat.java
License:Apache License
/**
 * Turns on output compression for the job and stores the requested deflate
 * level in the job configuration.
 *
 * @param job   the job to configure
 * @param level the deflate level to record
 */
public static void setDeflateLevel(Job job, int level) {
    final String levelKey = org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY;
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().setInt(levelKey, level);
}
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/** * Creates a new {@link MRPipeline} instance that contains common configuration * settings./*from w w w.j a va 2 s . com*/ * * @return a new {@link MRPipeline} instance, suitably configured */ protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException { Configuration conf = OryxConfiguration.get(getConf()); conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true); conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class); conf.setBoolean("mapred.output.compress", true); conf.set("mapred.output.compression.type", "BLOCK"); conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class); // Set old-style equivalents for Avro/Crunch's benefit conf.set("avro.output.codec", "snappy"); conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true); conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true); conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true); conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1); //conf.setBoolean("crunch.disable.deep.copy", true); // Giving one mapper a lot of data can cause issues in some stages, so default to disable this conf.setBoolean("crunch.disable.combine.file", true); Config appConfig = ConfigUtils.getDefaultConfig(); conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir")); int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb"); log.info("Mapper memory: {}", mapMemoryMB); int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB); if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) { log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS)); } conf.set(MRJobConfig.MAP_JAVA_OPTS, "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC"); log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS)); // 
See comment below on CM conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB); int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb"); log.info("Reducer memory: {}", reduceMemoryMB); if (isHighMemoryStep()) { reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor"); log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB); } conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB); int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB); if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) { log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS)); } conf.set(MRJobConfig.REDUCE_JAVA_OPTS, "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC"); log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS)); // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in // -Xmx appended to opts above, which is at worst redundant conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB); conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128); conf.setInt("yarn.app.mapreduce.am.resource.mb", 384); // Pass total config state conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render()); // Make sure to set any args to conf above this line! 
setConf(conf); Job job = Job.getInstance(conf); // Basic File IO settings FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); log.info("Created pipeline configuration {}", job.getConfiguration()); return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration()); }
From source file:com.datasalt.pangool.tuplemr.avro.AvroOutputFormat.java
License:Apache License
/**
 * Switches on output compression for the job and writes the given deflate
 * level into the job configuration under {@code DEFLATE_LEVEL_KEY}.
 *
 * @param job   the job to configure
 * @param level the deflate level to record
 */
public static void setDeflateLevel(Job job, int level) {
    final String levelKey = DEFLATE_LEVEL_KEY;
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().setInt(levelKey, level);
}
From source file:com.github.libsml.commons.util.HadoopUtils.java
License:Apache License
/**
 * Creates and configures a job whose keys and values may each be described
 * either by an Avro {@link Schema} or by a plain {@link Class}.
 *
 * <p>For every key/value argument typed {@code Object}: a {@link Schema} is
 * registered through {@link AvroJob}; where a class alternative exists (all but
 * the input key/value), a {@link Class} is set on the job; anything else
 * (including {@code null}) is ignored.
 *
 * @param inputPaths   input path string, passed unchanged to
 *                     {@code FileInputFormat.setInputPaths(Job, String)}
 * @param outputPath   output directory
 * @param inputFormat  input format; when {@code null} and {@code inputKey} is a
 *                     schema, an Avro key/value or key input format is chosen
 *                     depending on whether {@code inputValue} is a schema too
 * @param inputKey     schema of the input key (ignored unless a {@link Schema})
 * @param inputValue   schema of the input value (ignored unless a {@link Schema})
 * @param mapper       mapper class
 * @param mapperKey    schema or class of the map output key
 * @param mapperValue  schema or class of the map output value
 * @param combiner     combiner class, or {@code null} for none
 * @param reducer      reducer class, or {@code null} for a map-only job
 * @param outputKey    schema or class of the job output key
 * @param outputValue  schema or class of the job output value
 * @param outputFormat output format; when {@code null} and {@code outputKey} is
 *                     a schema, an Avro key/value or key output format is chosen
 *                     depending on whether {@code outputValue} is a schema too
 * @param conf         configuration the job is created from
 * @param overwrite    when {@code true}, {@code HadoopUtils.delete} is called on
 *                     the output path before returning
 * @param isCompress   when {@code true}, job output is compressed with
 *                     {@code DeflateCodec}
 * @return the configured (not yet submitted) job
 * @throws IOException if job creation or path handling fails
 * @throws IllegalStateException if both mapper and reducer are the Hadoop base
 *                     classes, so no user jar can be located
 */
public static Job prepareAvroJob(String inputPaths, String outputPath, Class<? extends InputFormat> inputFormat,
        Object inputKey, Object inputValue, Class<? extends Mapper> mapper, Object mapperKey,
        Object mapperValue, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Object outputKey, Object outputValue, Class<? extends OutputFormat> outputFormat,
        Configuration conf, boolean overwrite, boolean isCompress) throws IOException {

    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    // ---- Input side ----
    if (inputFormat == null && inputKey instanceof Schema) {
        if (inputValue instanceof Schema) {
            inputFormat = AvroKeyValueInputFormat.class;
        } else {
            inputFormat = AvroKeyInputFormat.class;
        }
    }
    if (inputFormat != null) {
        job.setInputFormatClass(inputFormat);
    }
    if (inputKey instanceof Schema) {
        AvroJob.setInputKeySchema(job, (Schema) inputKey);
    }
    if (inputValue instanceof Schema) {
        AvroJob.setInputValueSchema(job, (Schema) inputValue);
    }

    // ---- Output side ----
    if (outputFormat == null && outputKey instanceof Schema) {
        if (outputValue instanceof Schema) {
            outputFormat = AvroKeyValueOutputFormat.class;
        } else {
            outputFormat = AvroKeyOutputFormat.class;
        }
    }
    if (outputFormat != null) {
        job.setOutputFormatClass(outputFormat);
    }
    if (outputKey instanceof Schema) {
        AvroJob.setOutputKeySchema(job, (Schema) outputKey);
    } else if (outputKey instanceof Class) {
        job.setOutputKeyClass((Class) outputKey);
    }
    if (outputValue instanceof Schema) {
        AvroJob.setOutputValueSchema(job, (Schema) outputValue);
    } else if (outputValue instanceof Class) {
        job.setOutputValueClass((Class) outputValue);
    }

    // ---- Jar selection and map-only handling ----
    if (reducer == null) {
        // Map-only job: the mapper's key/value types are also applied as the
        // job's output types.
        job.setNumReduceTasks(0);
        // NOTE(review): the key branch registers a *map output* key schema while
        // the value branch registers a *job output* value schema; kept as in the
        // original, but confirm the asymmetry is intended.
        if (mapperKey instanceof Schema) {
            AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
        } else if (mapperKey instanceof Class) {
            job.setOutputKeyClass((Class) mapperKey);
        }
        if (mapperValue instanceof Schema) {
            AvroJob.setOutputValueSchema(job, (Schema) mapperValue);
        } else if (mapperValue instanceof Class) {
            // Fixed: the guard used to test mapperKey while casting mapperValue.
            job.setOutputValueClass((Class) mapperValue);
        }
        job.setJarByClass(mapper);
    } else if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    // ---- Paths and compression ----
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    if (isCompress) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
    }

    // ---- Mapper and intermediate types ----
    job.setMapperClass(mapper);
    if (mapperKey instanceof Schema) {
        AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
    } else if (mapperKey instanceof Class) {
        job.setMapOutputKeyClass((Class) mapperKey);
    }
    if (mapperValue instanceof Schema) {
        AvroJob.setMapOutputValueSchema(job, (Schema) mapperValue);
    } else if (mapperValue instanceof Class) {
        // Fixed: the guard used to test mapperKey while casting mapperValue.
        job.setMapOutputValueClass((Class) mapperValue);
    }

    if (reducer != null) {
        job.setReducerClass(reducer);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    if (overwrite) {
        HadoopUtils.delete(jobConf, new Path(outputPath));
    }
    return job;
}
From source file:com.github.libsml.commons.util.HadoopUtils.java
License:Apache License
public static Job prepareAvroJob(String inputPaths, Path outputPath, Schema inputKeySchema, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer, Schema outputKeySchema, Class<? extends Writable> outputValue, Configuration conf, boolean overwrite) throws IOException { Job job = Job.getInstance(conf);//from w w w . ja v a 2 s . c o m Configuration jobConf = job.getConfiguration(); if (reducer.equals(Reducer.class)) { if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); } job.setJarByClass(mapper); } else { job.setJarByClass(reducer); } FileInputFormat.setInputPaths(job, inputPaths); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class); job.setInputFormatClass(AvroKeyInputFormat.class); AvroJob.setInputKeySchema(job, inputKeySchema); job.setMapperClass(mapper); if (mapperKey != null) { job.setMapOutputKeyClass(mapperKey); } if (mapperValue != null) { job.setMapOutputValueClass(mapperValue); } if (combiner != null) { job.setCombinerClass(combiner); } job.setOutputFormatClass(AvroKeyOutputFormat.class); job.setReducerClass(reducer); AvroJob.setOutputKeySchema(job, outputKeySchema); job.setOutputValueClass(outputValue); if (overwrite) { HadoopUtils.delete(jobConf, outputPath); } return job; }
From source file:com.hadoop.mapreduce.TestLzoLazyLoading.java
License:Open Source License
private void runWordCount(Configuration cf, boolean compressIn, boolean compressOut) throws IOException, InterruptedException, ClassNotFoundException { Configuration thisConf = new Configuration(cf); if (compressIn) { thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-map", true); }//from ww w .j av a 2 s . co m if (compressOut) { thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-reduce", true); } Path pathIn = new Path(TEST_ROOT_DIR + "/in"); Path pathOut = new Path(TEST_ROOT_DIR + "/out"); localFs.delete(pathIn, true); localFs.delete(pathOut, true); writeFile(makeFileName("in/part1", compressIn), "this is a test\nof word count test\ntest\n"); writeFile(makeFileName("in/part2", compressIn), "more test"); Job job = new Job(thisConf, "word count"); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); if (compressOut) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, LzoCodec.class); } FileInputFormat.addInputPath(job, pathIn); FileOutputFormat.setOutputPath(job, pathOut); job.submit(); assertEquals("IsLzoChecked (client)?", compressIn, LzoCodec.isNativeLzoChecked()); assertTrue(job.waitForCompletion(false)); String result = readFile(makeFileName("out/part-r-00000", compressOut)); System.out.println(result); assertEquals("a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n", result); }
From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.AverageJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf, AverageJob.class.getSimpleName()); job.setJarByClass(AverageJob.class); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); job.setJobName("Sample Average Job"); job.setMapperClass(AverageMapper.class); job.setCombinerClass(AverageCombiner.class); job.setReducerClass(AverageReducer.class); job.setInputFormatClass(TextInputFormat.class); //job.setOutputFormatClass(TextOutputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); return 0;// ww w. j a v a 2 s . co m }
From source file:com.knewton.mapreduce.example.SSTableMRExample.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException, ParseException { long startTime = System.currentTimeMillis(); Options options = buildOptions();/*from w w w. j a v a 2 s . c o m*/ CommandLineParser cliParser = new BasicParser(); CommandLine cli = cliParser.parse(options, args); if (cli.getArgs().length < 2 || cli.hasOption('h')) { printUsage(options); } Job job = getJobConf(cli); job.setJarByClass(SSTableMRExample.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(StudentEventWritable.class); job.setMapperClass(StudentEventMapper.class); job.setReducerClass(StudentEventReducer.class); job.setInputFormatClass(SSTableColumnInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // input arg String inputPaths = cli.getArgs()[0]; LOG.info("Setting initial input paths to {}", inputPaths); SSTableInputFormat.addInputPaths(job, inputPaths); // output arg FileOutputFormat.setOutputPath(job, new Path(cli.getArgs()[1])); if (cli.hasOption('c')) { LOG.info("Using compression for output."); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); FileOutputFormat.setCompressOutput(job, true); } job.waitForCompletion(true); LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000); }