List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
From source file:com.cloudera.crunch.io.SourceTargetHelper.java
License:Open Source License
/**
 * Wires the output side of {@code job} to {@code path} using the given output format.
 *
 * <p>The output path is always set on the job. When {@code name} is {@code null} the output
 * format and the key/value classes reported by {@code handler} are applied to the job directly;
 * otherwise the same three pieces are registered under {@code name} through
 * {@code CrunchMultipleOutputs.addNamedOutput}.
 *
 * @param job the job being configured
 * @param outputFormatClass the output format to use for this target
 * @param handler supplies the output key and value classes
 * @param path the output path set on the job
 * @param name the named output to register, or {@code null} to configure the job itself
 */
public static void configureTarget(Job job, Class<? extends OutputFormat> outputFormatClass,
        DataBridge handler, Path path, String name) {
    FileOutputFormat.setOutputPath(job, path);

    if (name != null) {
        // Named target: register it as an additional output instead of touching the job's own.
        CrunchMultipleOutputs.addNamedOutput(job, name, outputFormatClass, handler.getKeyClass(),
                handler.getValueClass());
        return;
    }

    // Unnamed target: this is the job's own output.
    job.setOutputFormatClass(outputFormatClass);
    job.setOutputKeyClass(handler.getKeyClass());
    job.setOutputValueClass(handler.getValueClass());
}
From source file:com.cloudera.recordservice.avro.mapreduce.ColorCount.java
License:Apache License
/** * Run the MR2 color count with generic records, and return a map of favorite colors to * the number of users.//from www. j ava2 s . c o m */ public static java.util.Map<String, Integer> countColors() throws IOException, ClassNotFoundException, InterruptedException { String output = TestUtil.getTempDirectory(); Path outputPath = new Path(output); JobConf conf = new JobConf(ColorCount.class); conf.setInt("mapreduce.job.reduces", 1); Job job = Job.getInstance(conf); job.setJarByClass(ColorCount.class); job.setJobName("MR2 Color Count With Generic Records"); RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users"); job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); job.setMapperClass(Map.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(Reduce.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); job.waitForCompletion(false); // Read the result and return it. Since we set the number of reducers to 1, // there is always just one file containing the value. SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader); java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>(); for (GenericRecord datum : fileReader) { colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString())); } return colorMap; }
From source file:com.cloudera.recordservice.examples.mapreduce.MapReduceAgeCount.java
License:Apache License
public int run(String[] args) throws Exception { org.apache.log4j.BasicConfigurator.configure(); if (args.length != 2) { System.err.println("Usage: MapReduceAgeCount <input path> <output path>"); return -1; }//from w w w .j a v a2s .c o m Job job = Job.getInstance(getConf()); job.setJarByClass(MapReduceAgeCount.class); job.setJobName("Age Count"); // RECORDSERVICE: // To read from a table instead of a path, comment out // FileInputFormat.setInputPaths() and instead use: // FileInputFormat.setInputPaths(job, new Path(args[0])); RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]); // RECORDSERVICE: // Use the RecordService version of the AvroKeyValueInputFormat job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(AgeCountMapper.class); // Set schema for input key and value. AvroJob.setInputKeySchema(job, UserKey.getClassSchema()); AvroJob.setInputValueSchema(job, UserValue.getClassSchema()); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(AgeCountReducer.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.recordservice.examples.mapreduce.MapReduceColorCount.java
License:Apache License
@Override public int run(String[] args) throws Exception { org.apache.log4j.BasicConfigurator.configure(); if (args.length != 2) { System.err.println("Usage: MapReduceColorCount <input path> <output path>"); return -1; }/*from w w w . j av a2 s . c om*/ Job job = Job.getInstance(getConf()); job.setJarByClass(MapReduceColorCount.class); job.setJobName("Color Count"); // RECORDSERVICE: // To read from a table instead of a path, comment out // FileInputFormat.setInputPaths() and instead use: //FileInputFormat.setInputPaths(job, new Path(args[0])); RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users"); // RECORDSERVICE: // Use the RecordService version of the AvroKeyInputFormat job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class); //job.setInputFormatClass(AvroKeyInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(ColorCountMapper.class); AvroJob.setInputKeySchema(job, User.getClassSchema()); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(ColorCountReducer.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.recordservice.examples.mapreduce.RecordCount.java
License:Apache License
/**
 * Configures and runs the "recordcount" job.
 *
 * <p>Expects exactly two arguments: the input query handed to
 * {@code RecordServiceConfig.setInputQuery} and the output path. If the output path already
 * exists it is deleted recursively before the job is started. With any other argument count
 * a usage line is printed and the JVM exits with status 1.
 *
 * @param args the command-line arguments
 * @return {@code 0} when the job completes successfully, {@code 1} otherwise
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: RecordCount <input_query> <output_path>");
        System.exit(1);
    }
    String query = args[0];
    String outputDir = args[1];

    Job job = Job.getInstance(getConf());
    job.setJobName("recordcount");
    job.setJarByClass(RecordCount.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(1);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(LongWritable.class);

    RecordServiceConfig.setInputQuery(job.getConfiguration(), query);
    job.setInputFormatClass(RecordServiceInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Clear out any previous output at the target location before pointing the job at it.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    Path target = new Path(outputDir);
    if (fileSystem.exists(target)) {
        fileSystem.delete(target, true);
    }
    FileOutputFormat.setOutputPath(job, target);

    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java
License:Apache License
/** * @param args the cli arguments/* w ww . ja v a 2 s . c om*/ */ @Override public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Job job = Job.getInstance(getConf()); if (args.length != 2) { usage(); return 2; } setNumberOfRows(job, parseHumanLong(args[0])); Path outputDir = new Path(args[1]); if (outputDir.getFileSystem(getConf()).exists(outputDir)) { throw new IOException("Output directory " + outputDir + " already exists."); } FileOutputFormat.setOutputPath(job, outputDir); job.setJobName("TeraGen"); job.setJarByClass(TeraGen.class); job.setMapperClass(SortGenMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(RangeInputFormat.class); job.setOutputFormatClass(TeraOutputFormat.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.recordservice.examples.terasort.TeraSort.java
License:Apache License
/**
 * Configures and runs the "TeraSort" job.
 *
 * <p>Accepts two or three arguments: the input (a path, or a table name when RecordService
 * is used), the output directory, and an optional boolean that switches the input to
 * RecordService. Using RecordService forces the simple partitioner; otherwise a partition
 * file is written under the output directory and added to the job's cache files for the
 * total-order partitioner.
 *
 * @param args the command-line arguments
 * @return {@code 1} when the argument count is wrong or the job fails, {@code -1} when the
 *     partition file cannot be written, {@code 0} on success
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    boolean useSimplePartitioner = getUseSimplePartitioner(job);

    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
        // The RecordService input path always uses the simple partitioner.
        useSimplePartitioner = true;
    } else {
        Path inputDir = new Path(args[0]);
        TeraInputFormat.setInputPaths(job, inputDir);
        job.setInputFormatClass(TeraInputFormat.class);
    }

    Path outputDir = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TeraOutputFormat.class);

    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            // Log the throwable itself, not just its message: the message may be null and
            // on its own it hides where the failure happened.
            LOG.error(e.getMessage(), e);
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);

    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
From source file:com.cloudera.sqoop.mapreduce.db.DBOutputFormat.java
License:Apache License
/**
 * Selects {@code DBOutputFormat} as the job's output format and records the output table.
 *
 * <p>Also calls {@code ConfigurationHelper.setJobReduceSpeculativeExecution(job, false)}.
 *
 * @param job the job being configured
 * @param tableName the output table name to store in the configuration
 * @return a {@code DBConfiguration} wrapping the job's configuration, with the output table
 *     name set
 * @throws IOException declared by the signature; callers must handle it
 */
private static DBConfiguration setOutput(Job job, String tableName) throws IOException {
    job.setOutputFormatClass(DBOutputFormat.class);
    ConfigurationHelper.setJobReduceSpeculativeExecution(job, false);

    final DBConfiguration outputConf = new DBConfiguration(job.getConfiguration());
    outputConf.setOutputTableName(tableName);
    return outputConf;
}
From source file:com.cloudera.sqoop.mapreduce.HBaseImportJob.java
License:Apache License
@Override protected void configureOutputFormat(Job job, String tableName, String tableClassName) throws ClassNotFoundException, IOException { // Use the DelegatingOutputFormat with the HBasePutProcessor. job.setOutputFormatClass(getOutputFormatClass()); Configuration conf = job.getConfiguration(); conf.setClass("sqoop.output.delegate.field.map.processor.class", HBasePutProcessor.class, FieldMapProcessor.class); // Set the HBase parameters (table, column family, row key): conf.set(HBasePutProcessor.TABLE_NAME_KEY, options.getHBaseTable()); conf.set(HBasePutProcessor.COL_FAMILY_KEY, options.getHBaseColFamily()); // What column of the input becomes the row key? String rowKeyCol = options.getHBaseRowKeyColumn(); if (null == rowKeyCol) { // User didn't explicitly set one. If there's a split-by column set, // use that. rowKeyCol = options.getSplitByCol(); }// w w w. ja va 2s .c o m if (null == rowKeyCol) { // No split-by column is explicitly set. // If the table has a primary key, use that. ConnManager manager = getContext().getConnManager(); rowKeyCol = manager.getPrimaryKey(tableName); } if (null == rowKeyCol) { // Give up here if this is still unset. throw new IOException("Could not determine the row-key column. " + "Use --hbase-row-key to specify the input column that " + "names each row."); } conf.set(HBasePutProcessor.ROW_KEY_COLUMN_KEY, rowKeyCol); }
From source file:com.cloudera.sqoop.mapreduce.MergeJob.java
License:Apache License
/**
 * Builds and runs the merge job described by {@code options}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Fails with an {@code IOException} if {@code options.getClassName()} is null.</li>
 *   <li>Sets {@code mapred.jar} either to the user-specified existing jar or, when none is
 *       given, to the jar located via {@code Jars.getJarPathForClass} for the record class;
 *       a warning is logged when the class lookup returns null.</li>
 *   <li>Qualifies the "old" and "new" merge paths against the job's file system, adds both
 *       as input paths, and stores them, the merge key column and the record class name in
 *       the job configuration.</li>
 *   <li>Sets the output path to {@code options.getTargetDir()}.</li>
 *   <li>Chooses sequence-file input/output formats with {@code MergeRecordMapper} when
 *       {@code ExportJobBase.isSequenceFiles(jobConf, newPath)} is true; otherwise uses
 *       {@code MergeTextMapper} with {@code RawKeyTextOutputFormat}.</li>
 *   <li>Sets the output key class (by name) to the record class, the output value class to
 *       {@code NullWritable}, the reducer to {@code MergeReducer}, and the map output types
 *       to {@code Text} / {@code MergeRecord}.</li>
 *   <li>Calls {@code cacheJars(job, null)} and then {@code runJob(job)}.</li>
 * </ol>
 *
 * NOTE(review): in this copy of the source the whole method is collapsed onto two physical
 * lines and the "is not in a jar." warning string straddles the line break; confirm the exact
 * message text against the upstream file before reformatting.
 *
 * @return the result of {@code runJob(job)}
 * @throws IOException if the record class name is missing, or wrapping an
 *     {@code InterruptedException} or {@code ClassNotFoundException} raised during setup or
 *     execution
 */
public boolean runMergeJob() throws IOException { Configuration conf = options.getConf(); Job job = new Job(conf); String userClassName = options.getClassName(); if (null == userClassName) { // Shouldn't get here. throw new IOException("Record class name not specified with " + "--class-name."); }/*from w ww. ja v a 2 s .c om*/ // Set the external jar to use for the job. String existingJar = options.getExistingJarName(); if (existingJar != null) { // User explicitly identified a jar path. LOG.debug("Setting job jar to user-specified jar: " + existingJar); job.getConfiguration().set("mapred.jar", existingJar); } else { // Infer it from the location of the specified class, if it's on the // classpath. try { Class<? extends Object> userClass = conf.getClassByName(userClassName); if (null != userClass) { String userJar = Jars.getJarPathForClass(userClass); LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar); job.getConfiguration().set("mapred.jar", userJar); } else { LOG.warn("Specified class " + userClassName + " is not in a jar. 
" + "MapReduce may not find the class"); } } catch (ClassNotFoundException cnfe) { throw new IOException(cnfe); } } try { Path oldPath = new Path(options.getMergeOldPath()); Path newPath = new Path(options.getMergeNewPath()); Configuration jobConf = job.getConfiguration(); FileSystem fs = FileSystem.get(jobConf); oldPath = oldPath.makeQualified(fs); newPath = newPath.makeQualified(fs); FileInputFormat.addInputPath(job, oldPath); FileInputFormat.addInputPath(job, newPath); jobConf.set(MERGE_OLD_PATH_KEY, oldPath.toString()); jobConf.set(MERGE_NEW_PATH_KEY, newPath.toString()); jobConf.set(MERGE_KEY_COL_KEY, options.getMergeKeyCol()); jobConf.set(MERGE_SQOOP_RECORD_KEY, userClassName); FileOutputFormat.setOutputPath(job, new Path(options.getTargetDir())); if (ExportJobBase.isSequenceFiles(jobConf, newPath)) { job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(MergeRecordMapper.class); } else { job.setMapperClass(MergeTextMapper.class); job.setOutputFormatClass(RawKeyTextOutputFormat.class); } jobConf.set("mapred.output.key.class", userClassName); job.setOutputValueClass(NullWritable.class); job.setReducerClass(MergeReducer.class); // Set the intermediate data types. job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(MergeRecord.class); // Make sure Sqoop and anything else we need is on the classpath. cacheJars(job, null); return this.runJob(job); } catch (InterruptedException ie) { throw new IOException(ie); } catch (ClassNotFoundException cnfe) { throw new IOException(cnfe); } }