List of usage examples for org.apache.hadoop.mapred.JobConf.setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
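Before the per-project examples, here is a minimal sketch of the call in context (the class name SetInputFormatSketch and the input path are placeholders, not taken from any of the projects below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputFormatSketch {
    public static void main(String[] args) {
        // setInputFormat selects the InputFormat implementation the job uses
        // to split and read its input; TextInputFormat reads lines of text.
        JobConf conf = new JobConf(SetInputFormatSketch.class);
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path("/tmp/input")); // placeholder path
    }
}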
From source file:com.chriscx.matching.Driver.java
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), com.chriscx.mapred.Driver.class);
    conf.setJobName("Matching");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.cloudera.avro.AvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    JobConf conf = new JobConf(AvroWordCount.class);
    conf.setJobName("wordcount");

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputSchema(conf,
            Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.cloudera.hbase.OldWordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    conf.setJobName("word count");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

    JobClient.runJob(conf);
}
From source file:com.cloudera.recordservice.avro.AvroJob.java
License:Apache License
public static void setInputFormat(org.apache.hadoop.mapred.JobConf job,
        Class<? extends org.apache.hadoop.mapred.InputFormat> c) {
    if (job.getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false)) {
        if (c.getName().equals(org.apache.avro.mapred.AvroInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapred.AvroInputFormat.class;
        } else {
            throw new RuntimeException("Class '" + c.getName() + "' is not supported "
                    + "by the RecordService. Use AvroInputFormat or disable RecordService.");
        }
    }
    LOG.debug("Using input format: " + c.getName());
    job.setInputFormat(c);
}
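A hypothetical call site for this wrapper might look as follows. This is a sketch only: it assumes the USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY constant is accessible to the caller, which the snippet above does not show.

// Sketch: with the RecordService flag enabled, the wrapper swaps in the
// RecordService-backed AvroInputFormat; any other class would throw.
// Assumes AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY is visible here.
org.apache.hadoop.mapred.JobConf job = new org.apache.hadoop.mapred.JobConf();
job.setBoolean(AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
AvroJob.setInputFormat(job, org.apache.avro.mapred.AvroInputFormat.class);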
From source file:com.cloudera.recordservice.examples.mapreduce.WordCount.java
License:Apache License
public void run(String[] args) throws Exception {
    boolean useRecordService = true;
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    } else if (args.length != 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(-1);
    }
    String input = args[0].trim();
    String output = args[1];

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount-" + (useRecordService ? "with" : "without") + "-RecordService");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    if (useRecordService) {
        conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
        RecordServiceConfig.setInput(conf, input);
    } else {
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
    }

    FileSystem fs = FileSystem.get(conf);
    Path outputPath = new Path(output);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);
    System.out.println("Done");
}
From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License:Apache License
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
        InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
        TableDesc table, List<InputSplit> result) throws IOException {
    Utilities.copyTableJobPropertiesToConf(table, conf);

    // The table and database name to scan.
    // TODO: This is commented out until we have a pluggable way to configure the
    // SerDe. Until then, create a separate table and set the job conf properties.
    // String fqTblName[] = table.getTableName().split("\\.");
    // conf.set("recordservice.table.name", table.getTableName());

    if (tableScan != null) {
        pushFilters(conf, tableScan);
        // Set the projected column and table info for the RecordServiceRecordReader.
        conf.set("recordservice.col.names", Joiner.on(",").join(tableScan.getNeededColumns()));
    }
    // Unset the file config. We're going to be just reading from the table.
    conf.unset(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    conf.setInputFormat(inputFormat.getClass());

    // Generate the RecordService splits.
    InputSplit[] iss = inputFormat.getSplits(conf, splits);
    for (InputSplit is : iss) {
        if (is instanceof FileSplit) {
            FileSplit fileSplit = (FileSplit) is;
            LOG.info("INPUT SPLIT: " + fileSplit.getPath().toString());
        }
    }

    // Wrap the InputSplits in HiveInputSplits. We use a modified version of the
    // HiveInputSplit to work around some issues with the base one.
    // TODO: Get changes incorporated into Hive.
    for (InputSplit is : iss) {
        result.add(new HiveInputSplitShim(dirs.get(0), is, inputFormatClass.getName()));
    }
}
From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java
License:Apache License
public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");

    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("mapreduce.job.reduces", 1);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}
From source file:com.cloudera.recordservice.tests.TestMiniClusterController.java
License:Apache License
public static void fillInWordCountMRJobConf(JobConf conf) {
    String input = "select n_comment from tpch.nation";

    conf.setJobName("samplejob-wordcount");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    com.cloudera.recordservice.mr.RecordServiceConfig.setInputQuery(conf, input);
    setRandomOutputDir(conf);
}
From source file:com.csiro.hadoop.UFORecord.java
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: avro UFO counter <in> <out>");
        System.exit(2);
    }

    org.apache.hadoop.mapred.FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath);

    Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, input_schema);
    AvroJob.setMapOutputSchema(conf,
            Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);

    conf.setInputFormat(AvroInputFormat.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
    conf.setInputFormat(CsvInputFormat.class);
    configureReaderFormat(format, conf);
}