List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
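Before the collected examples, a minimal sketch of a typical call site: an old-API (org.apache.hadoop.mapred) job that relies on the default identity mapper and reducer and writes plain text output. The class name and paths below are placeholders, not taken from any example on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetOutputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetOutputFormatSketch.class);
        conf.setJobName("set-output-format-sketch");
        // No mapper/reducer set, so Hadoop runs the identity classes;
        // TextInputFormat produces (LongWritable, Text) records
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        // setOutputFormat selects the OutputFormat implementation that
        // writes the job's final records; here, plain text files
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}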
From source file:com.liveramp.hank.cascading.DomainBuilderTap.java
License:Apache License
public void sinkConfInit(FlowProcess<JobConf> process, JobConf conf) {
    super.sinkConfInit(process, conf);
    // Output Format
    conf.setOutputFormat(this.outputFormatClass);
    // Output Committer
    conf.setOutputCommitter(DomainBuilderOutputCommitter.class);
    // Set this tap's Domain name locally in the conf
    if (conf.get(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME) != null) {
        throw new RuntimeException("Trying to set domain name configuration parameter to " + domainName
                + " but it was previously set to "
                + conf.get(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME));
    } else {
        conf.set(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, domainName);
    }
}
From source file:com.liveramp.hank.hadoop.AbstractHadoopDomainBuilder.java
License:Apache License
private void configureJobCommon(DomainBuilderProperties properties, int versionNumber, int numPartitions,
        JobConf conf) throws IOException {
    // Hank specific configuration
    properties.setJobConfProperties(conf, versionNumber);
    // Output Committer
    conf.setOutputCommitter(DomainBuilderOutputCommitter.class);
    // Output path (set to tmp output path)
    FileOutputFormat.setOutputPath(conf, new Path(properties.getTmpOutputPath(versionNumber)));
    // Output format
    conf.setOutputFormat(properties.getOutputFormatClass());
    // Num reduce tasks
    conf.setNumReduceTasks(numPartitions);
}
From source file:com.manning.hip.ch4.joins.improved.impl.OptimizedDataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException {
    String inputDir = args[0];
    String outputDir = args[1];
    // Input format: SequenceFile unless "text" is requested
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    // Output format: text unless a value class is given, in which case a
    // block-compressed SequenceFile is written
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, OptimizedDataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);
    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormat(inputFormat);
    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);
    // Partition and sort on the composite key so rows sharing a join key
    // reach the same reducer, grouped by the key-only comparator
    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
    job.setOutputValueGroupingComparator(CompositeKeyOnlyComparator.class);
    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}
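As a usage sketch, the factory above might be driven as follows. Every com.example class name and path in the argument array is a hypothetical placeholder for illustration, not part of the original source.

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import com.manning.hip.ch4.joins.improved.impl.OptimizedDataJoinJob;

public class DataJoinDriverSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical argument array; positions follow createDataJoinJob
        String[] joinArgs = {
            "/data/join/input",          // args[0]: input directory
            "/data/join/output",         // args[1]: output directory
            "text",                      // args[2]: use TextInputFormat
            "4",                         // args[3]: number of reducers
            "com.example.MyJoinMapper",  // args[4]: mapper class (placeholder)
            "com.example.MyJoinReducer", // args[5]: reducer class (placeholder)
            "org.apache.hadoop.io.Text", // args[6]: map output value class
            "text",                      // args[7]: use TextOutputFormat
            "1000",                      // args[8]: max values per group
            "example-join"               // args[9]: job name
        };
        JobConf joinJob = OptimizedDataJoinJob.createDataJoinJob(joinArgs);
        JobClient.runJob(joinJob);
    }
}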
From source file:com.maxpoint.cascading.avro.AvroScheme.java
License:Open Source License
@Override
public void sinkConfInit(FlowProcess<JobConf> process,
        Tap<JobConf, RecordReader<AvroWrapper<Record>, Writable>, OutputCollector<AvroWrapper<Record>, Writable>> tap,
        JobConf conf) {
    conf.set(AvroJob.OUTPUT_SCHEMA, dataSchema.toString());
    conf.setOutputFormat(AvroOutputFormat.class);
    conf.setOutputKeyClass(AvroWrapper.class);
    // set compression
    AvroOutputFormat.setDeflateLevel(conf, 6);
    AvroJob.setOutputCodec(conf, DataFileConstants.DEFLATE_CODEC);
    AvroOutputFormat.setSyncInterval(conf, 1048576);
}
From source file:com.mh2c.WikipediaDumpLoaderDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // arg checks

    JobConf conf = new JobConf(getClass());
    conf.setJobName("WP dump loader");
    // Set the mapper class, but skip the reduce phase
    conf.setMapperClass(WikipediaDumpLoaderMapper.class);
    conf.setNumReduceTasks(0);
    // The object key/value pairs are text
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    // Stream XML into the job
    conf.setInputFormat(StreamInputFormat.class);
    StreamInputFormat.addInputPath(conf, new Path(args[0]));
    // Use the XML record reader, with each page as one record
    conf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    conf.set("stream.recordreader.begin", "<page>");
    conf.set("stream.recordreader.end", "</page>");
    // Emit sequence files
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.mh2c.WikipediaWordCountDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // arg checks

    JobConf conf = new JobConf(getClass());
    conf.setJobName("WP word count");
    // Set the mapper and reducer classes, and use the reducer as a combiner
    conf.setMapperClass(WikipediaWordCountMapper.class);
    conf.setReducerClass(WikipediaWordCountReducer.class);
    conf.setCombinerClass(WikipediaWordCountReducer.class);
    // The object key/value pairs are text words and integer counts
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    // Read in sequence files
    conf.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(conf, new Path(args[0]));
    // Emit ordinary text files
    conf.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.mongodb.hadoop.examples.treasury.TreasuryYieldXMLConfigV2.java
License:Apache License
public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();
    final JobConf job = new JobConf(conf);
    job.setReducerClass(TreasuryYieldReducerV2.class);
    job.setMapperClass(TreasuryYieldMapperV2.class);
    // Read from and write back to MongoDB; the key/value classes are
    // resolved from the Mongo connector's configuration
    job.setOutputFormat(MongoOutputFormat.class);
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setMapOutputKeyClass(MongoConfigUtil.getMapperOutputKey(conf));
    job.setMapOutputValueClass(MongoConfigUtil.getMapperOutputValue(conf));
    job.setInputFormat(MongoInputFormat.class);
    JobClient.runJob(job);
    return 0;
}
From source file:com.mongodb.hadoop.util.MongoTool.java
License:Apache License
private int runMapredJob(final Configuration conf) {
    final JobConf job = new JobConf(conf, getClass());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MapredMongoConfigUtil.getReducer(conf));

    job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf));
    job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MapredMongoConfigUtil.isJobBackground(conf);
    try {
        RunningJob runningJob = JobClient.runJob(job);
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            runningJob.waitForCompletion();
            return 0;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:com.mycompany.mavenproject1.App.java
public static void main(String[] args) throws IOException {
    // give time to attach debugger
    try {
        Thread.sleep(8000);
    } catch (InterruptedException ex) {
        Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
    }

    JobConf conf = new JobConf(App.class);

    // purge existing output file
    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true); // delete file, true for recursive

    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(WholeFileInputFormat.class);
    // conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.mycompany.MyHadoopSamples1.TransposeJob.java
License:Apache License
public static Configuration buildTransposeJobConf(Configuration initialConf, Path matrixInputPath,
        Path matrixOutputPath, int numInputRows) throws IOException {
    JobConf conf = new JobConf(initialConf, TransposeJob.class);
    conf.setJobName("TransposeJob: " + matrixInputPath + " transpose -> " + matrixOutputPath);
    FileSystem fs = FileSystem.get(conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    conf.setInt(NUM_ROWS_KEY, numInputRows);

    FileInputFormat.addInputPath(conf, matrixInputPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(conf, matrixOutputPath);
    System.out.println("OUTPUT --> " + matrixOutputPath.toString());
    // Map each (row, vector) record to per-column partial vectors, then
    // merge the partials per column in the combiner and reducer
    conf.setMapperClass(TransposeMapper.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setCombinerClass(MergeVectorsCombiner.class);
    conf.setReducerClass(MergeVectorsReducer.class);
    // The transposed matrix is written as a SequenceFile of
    // (IntWritable, VectorWritable) pairs
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    return conf;
}
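A minimal sketch of how the returned configuration might be executed; the paths and row count are placeholders supplied for illustration, not values from the original source.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import com.mycompany.MyHadoopSamples1.TransposeJob;

public class TransposeDriverSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder input/output paths and row count
        Configuration transposeConf = TransposeJob.buildTransposeJobConf(
                new Configuration(),
                new Path("/data/matrix/in"),
                new Path("/data/matrix/out"),
                10000);
        // buildTransposeJobConf returns a Configuration; JobClient.runJob
        // expects a JobConf, so wrap it before submitting
        JobClient.runJob(new JobConf(transposeConf));
    }
}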