List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
From source file:com.cloudera.recordservice.avro.AvroJob.java
License:Apache License
/**
 * Sets the input format class on {@code job}.
 *
 * <p>When the job configuration has {@code USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY} set to
 * true (it is read with a default of false), the stock Avro key and key/value input formats
 * are swapped for their RecordService counterparts; any other class is rejected.
 *
 * @param job the job whose input format is being set
 * @param c the requested input format class
 * @throws RuntimeException if the RecordService input format is enabled and {@code c} is
 *     neither of the two supported Avro input formats
 */
public static void setInputFormatClass(org.apache.hadoop.mapreduce.Job job,
    Class<? extends org.apache.hadoop.mapreduce.InputFormat> c) {
  Class<? extends org.apache.hadoop.mapreduce.InputFormat> effective = c;

  boolean useRecordService =
      job.getConfiguration().getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false);
  if (useRecordService) {
    // Classes are matched by name, exactly as requested by the caller.
    String requestedName = c.getName();
    String avroKeyName = org.apache.avro.mapreduce.AvroKeyInputFormat.class.getName();
    String avroKeyValueName = org.apache.avro.mapreduce.AvroKeyValueInputFormat.class.getName();

    if (avroKeyName.equals(requestedName)) {
      effective = com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class;
    } else if (avroKeyValueName.equals(requestedName)) {
      effective = com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class;
    } else {
      throw new RuntimeException("Class '" + requestedName + "' is not supported by "
          + "the RecordService. Use AvroKeyValueInputFormat or "
          + "AvroKeyInputFormat or disable RecordService.");
    }
  }

  LOG.debug("Using input format: " + effective.getName());
  job.setInputFormatClass(effective);
}
From source file:com.cloudera.recordservice.avro.mapreduce.ColorCount.java
License:Apache License
/** * Run the MR2 color count with generic records, and return a map of favorite colors to * the number of users.//from ww w . j av a 2 s . c om */ public static java.util.Map<String, Integer> countColors() throws IOException, ClassNotFoundException, InterruptedException { String output = TestUtil.getTempDirectory(); Path outputPath = new Path(output); JobConf conf = new JobConf(ColorCount.class); conf.setInt("mapreduce.job.reduces", 1); Job job = Job.getInstance(conf); job.setJarByClass(ColorCount.class); job.setJobName("MR2 Color Count With Generic Records"); RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users"); job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); job.setMapperClass(Map.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(Reduce.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); job.waitForCompletion(false); // Read the result and return it. Since we set the number of reducers to 1, // there is always just one file containing the value. SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader); java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>(); for (GenericRecord datum : fileReader) { colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString())); } return colorMap; }
From source file:com.cloudera.recordservice.examples.mapreduce.MapReduceAgeCount.java
License:Apache License
public int run(String[] args) throws Exception { org.apache.log4j.BasicConfigurator.configure(); if (args.length != 2) { System.err.println("Usage: MapReduceAgeCount <input path> <output path>"); return -1; }/*www. ja va 2 s.co m*/ Job job = Job.getInstance(getConf()); job.setJarByClass(MapReduceAgeCount.class); job.setJobName("Age Count"); // RECORDSERVICE: // To read from a table instead of a path, comment out // FileInputFormat.setInputPaths() and instead use: // FileInputFormat.setInputPaths(job, new Path(args[0])); RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]); // RECORDSERVICE: // Use the RecordService version of the AvroKeyValueInputFormat job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(AgeCountMapper.class); // Set schema for input key and value. AvroJob.setInputKeySchema(job, UserKey.getClassSchema()); AvroJob.setInputValueSchema(job, UserValue.getClassSchema()); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(AgeCountReducer.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.recordservice.examples.mapreduce.MapReduceColorCount.java
License:Apache License
@Override public int run(String[] args) throws Exception { org.apache.log4j.BasicConfigurator.configure(); if (args.length != 2) { System.err.println("Usage: MapReduceColorCount <input path> <output path>"); return -1; }/*from ww w . j a va2 s . c o m*/ Job job = Job.getInstance(getConf()); job.setJarByClass(MapReduceColorCount.class); job.setJobName("Color Count"); // RECORDSERVICE: // To read from a table instead of a path, comment out // FileInputFormat.setInputPaths() and instead use: //FileInputFormat.setInputPaths(job, new Path(args[0])); RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users"); // RECORDSERVICE: // Use the RecordService version of the AvroKeyInputFormat job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class); //job.setInputFormatClass(AvroKeyInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(ColorCountMapper.class); AvroJob.setInputKeySchema(job, User.getClassSchema()); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(ColorCountReducer.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.recordservice.examples.mapreduce.RecordCount.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: RecordCount <input_query> <output_path>"); System.exit(1);//from w ww . ja va 2 s . c om } String inputQuery = args[0]; String output = args[1]; Job job = Job.getInstance(getConf()); job.setJobName("recordcount"); job.setJarByClass(RecordCount.class); job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); job.setNumReduceTasks(1); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(LongWritable.class); RecordServiceConfig.setInputQuery(job.getConfiguration(), inputQuery); job.setInputFormatClass(RecordServiceInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileSystem fs = FileSystem.get(job.getConfiguration()); Path outputPath = new Path(output); if (fs.exists(outputPath)) fs.delete(outputPath, true); FileOutputFormat.setOutputPath(job, outputPath); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.recordservice.examples.terasort.TeraChecksum.java
License:Apache License
@Override public int run(String[] args) throws Exception { boolean useRecordService = false; Job job = Job.getInstance(getConf()); if (args.length != 2 && args.length != 3) { usage();//w ww.jav a 2s . c o m return 2; } if (args.length == 3) { useRecordService = Boolean.parseBoolean(args[2]); } FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSum"); job.setJarByClass(TeraChecksum.class); job.setMapperClass(ChecksumMapper.class); job.setReducerClass(ChecksumReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Unsigned16.class); // force a single reducer job.setNumReduceTasks(1); if (useRecordService) { RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]); job.setInputFormatClass(RecordServiceTeraInputFormat.class); } else { TeraInputFormat.setInputPaths(job, new Path(args[0])); job.setInputFormatClass(TeraInputFormat.class); } return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java
License:Apache License
/** * @param args the cli arguments/*from ww w. j ava 2 s . c o m*/ */ @Override public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Job job = Job.getInstance(getConf()); if (args.length != 2) { usage(); return 2; } setNumberOfRows(job, parseHumanLong(args[0])); Path outputDir = new Path(args[1]); if (outputDir.getFileSystem(getConf()).exists(outputDir)) { throw new IOException("Output directory " + outputDir + " already exists."); } FileOutputFormat.setOutputPath(job, outputDir); job.setJobName("TeraGen"); job.setJarByClass(TeraGen.class); job.setMapperClass(SortGenMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(RangeInputFormat.class); job.setOutputFormatClass(TeraOutputFormat.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.recordservice.examples.terasort.TeraSort.java
License:Apache License
@Override public int run(String[] args) throws Exception { boolean useRecordService = false; if (args.length != 2 && args.length != 3) { usage();// w w w . j a va2s . c o m return 1; } if (args.length == 3) { useRecordService = Boolean.parseBoolean(args[2]); } LOG.info("starting"); Job job = Job.getInstance(getConf()); boolean useSimplePartitioner = getUseSimplePartitioner(job); if (useRecordService) { RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]); job.setInputFormatClass(RecordServiceTeraInputFormat.class); useSimplePartitioner = true; } else { Path inputDir = new Path(args[0]); TeraInputFormat.setInputPaths(job, inputDir); job.setInputFormatClass(TeraInputFormat.class); } Path outputDir = new Path(args[1]); FileOutputFormat.setOutputPath(job, outputDir); job.setJobName("TeraSort"); job.setJarByClass(TeraSort.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(TeraOutputFormat.class); if (useSimplePartitioner) { job.setPartitionerClass(SimplePartitioner.class); } else { long start = System.currentTimeMillis(); Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME); try { TeraInputFormat.writePartitionFile(job, partitionFile); } catch (Throwable e) { LOG.error(e.getMessage()); return -1; } job.addCacheFile(partitionUri); long end = System.currentTimeMillis(); System.out.println("Spent " + (end - start) + "ms computing partitions."); job.setPartitionerClass(TotalOrderPartitioner.class); } job.getConfiguration().setInt("dfs.replication", getOutputReplication(job)); TeraOutputFormat.setFinalSync(job, true); int ret = job.waitForCompletion(true) ? 0 : 1; LOG.info("done"); return ret; }
From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java
License:Apache License
@Override public int run(String[] args) throws Exception { boolean useRecordService = false; if (args.length != 2 && args.length != 3) { usage();/*from w ww. ja va2 s. c o m*/ return 1; } if (args.length == 3) { useRecordService = Boolean.parseBoolean(args[2]); } Job job = Job.getInstance(getConf()); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraValidate"); job.setJarByClass(TeraValidate.class); job.setMapperClass(ValidateMapper.class); job.setReducerClass(ValidateReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // force a single reducer job.setNumReduceTasks(1); // force a single split FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE); if (useRecordService) { RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]); job.setInputFormatClass(RecordServiceTeraInputFormat.class); } else { TeraInputFormat.setInputPaths(job, new Path(args[0])); job.setInputFormatClass(TeraInputFormat.class); } return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSInputFormat.java
License:Apache License
/** * Initializes the input with a provided filter. * See {@link #setInput(Configuration, String, String, String)} *///from w ww .ja va 2 s.com public static HCatRSInputFormat setInput(Job job, String location, String filter) throws IOException { Configuration conf = job.getConfiguration(); String kerberosPrincipal = conf.get(ConfVars.KERBEROS_PRINCIPAL_CONF.name); Pair<String, String> dbTablePair = HCatUtil.getDbAndTableName(location); dbTablePair = HCatRSUtil.cleanQueryPair(dbTablePair); String dbName = dbTablePair.first; String tableName = dbTablePair.second; if (location.toLowerCase().startsWith("select")) { RecordServiceConfig.setInputQuery(conf, location); } else { RecordServiceConfig.setInputTable(conf, dbName, tableName); } Credentials credentials = job.getCredentials(); RecordServicePlannerClient.Builder builder = PlanUtil.getBuilder(conf); List<NetworkAddress> plannerHosts = PlanUtil.getPlannerHostPorts(conf); RecordServicePlannerClient planner = PlanUtil.getPlanner(conf, builder, plannerHosts, kerberosPrincipal, credentials); try { if (planner.isKerberosAuthenticated()) { Token<DelegationTokenIdentifier> delegationToken = TokenUtils .fromTDelegationToken(planner.getDelegationToken("")); credentials.addToken(DelegationTokenIdentifier.DELEGATION_KIND, delegationToken); } } catch (RecordServiceException e) { throw new IOException(e); } finally { if (planner != null) planner.close(); } job.setInputFormatClass(HCatRSInputFormat.class); return setInput(conf, dbName, tableName, filter); }