List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
From source file:com.benchmark.mapred.SleepJob.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount, long reduceSleepTime, int reduceSleepCount) { JobConf job = new JobConf(getConf(), SleepJob.class); job.setNumMapTasks(numMapper);/*from www. j a v a 2 s. c o m*/ job.setNumReduceTasks(numReducer); job.setMapperClass(SleepJob.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(NullWritable.class); job.setReducerClass(SleepJob.class); job.setOutputFormat(NullOutputFormat.class); job.setInputFormat(SleepInputFormat.class); job.setPartitionerClass(SleepJob.class); job.setSpeculativeExecution(false); job.setJobName("Sleep job"); FileInputFormat.addInputPath(job, new Path("ignored")); job.setLong("sleep.job.map.sleep.time", mapSleepTime); job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime); job.setInt("sleep.job.map.sleep.count", mapSleepCount); job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount); return job; }
From source file:com.cloudera.avro.AvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: AvroWordCount <input path> <output path>"); return -1; }/* w w w .j a v a 2 s. c o m*/ JobConf conf = new JobConf(AvroWordCount.class); conf.setJobName("wordcount"); // We call setOutputSchema first so we can override the configuration // parameters it sets AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT))); conf.setMapperClass(Map.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setOutputKeyComparatorClass(Text.Comparator.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; }
From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java
License:Apache License
public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 3) { System.err.println("Usage: urlresolution <url-map> <url-register> <out>"); System.exit(2);// w ww .j av a2 s . c o m } JobConf job = new JobConf(conf); FileSystem fS = FileSystem.get(conf); fS.delete(new Path(otherArgs[2]), true); MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class); MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class); job.setJarByClass(HadoopUrlResolution.class); job.setPartitionerClass(KeyPartitioner.class); job.setOutputValueGroupingComparator(GroupingComparator.class); job.setMapOutputKeyClass(UrlRegJoinUrlMap.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); Job j = new Job(job); j.setReducerClass(Reduce.class); j.waitForCompletion(true); }
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("m", "model", true, "location of the model"); // parse the command line arguments CommandLine line = null;//from ww w . j av a 2s . c o m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); String model = line.getOptionValue("m"); if (line.hasOption("help")) { formatter.printHelp("ClassifierJob", options); return 0; } if (model == null | input == null | output == null) { formatter.printHelp("ClassifierJob", options); return -1; } } catch (ParseException e) { formatter.printHelp("ClassifierJob", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); String modelPath = line.getOptionValue("m"); JobConf job = new JobConf(getConf()); // push the model file to the DistributedCache DistributedCache.addCacheArchive(new URI(modelPath), job); job.setJarByClass(this.getClass()); job.setJobName("ClassifierJob : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TextClassifierMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.set(modelNameParam, modelPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 1) { String syntax = "com.digitalpebble.behemoth.ESIndexerJob input"; System.err.println(syntax); return -1; }// ww w . ja va2 s . c o m Path inputPath = new Path(args[0]); JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Indexing " + inputPath + " into ElasticSearch"); job.setInputFormat(SequenceFileInputFormat.class); job.setMapOutputValueClass(MapWritable.class); job.setMapperClass(BehemothToESMapper.class); job.setSpeculativeExecution(false); // disable speculative execution // when writing to ES // job.set("es.resource", "radio/artists"); // index used for storing // data job.setOutputFormat(EsOutputFormat.class); // use dedicated output // format FileInputFormat.addInputPath(job, inputPath); // no reducer : send straight to elasticsearch at end of mapping job.setNumReduceTasks(0); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error("Exception while running job", e); return -1; } return 0; }
From source file:com.digitalpebble.behemoth.languageidentification.LanguageIdDriver.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input file or directory"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("w", "overwrite", false, "overwrite the output"); Path inputPath = null;// ww w . ja va2 s. c om Path outputPath = null; boolean overWrite = false; // parse the command line arguments CommandLine cmdLine = null; try { cmdLine = parser.parse(options, args); String input = cmdLine.getOptionValue("i"); String output = cmdLine.getOptionValue("o"); if (cmdLine.hasOption("help")) { formatter.printHelp("LanguageIdDriver", options); return 0; } if (input == null | output == null) { formatter.printHelp("LanguageIdDriver", options); return -1; } inputPath = new Path(input); outputPath = new Path(output); if (cmdLine.hasOption("overwrite")) { overWrite = true; } } catch (ParseException e) { formatter.printHelp("LanguageIdDriver", options); } // check whether needs overwriting if (FileSystem.get(outputPath.toUri(), getConf()).exists(outputPath)) { if (!overWrite) { System.out.println("Output path " + outputPath + " already exists. Use option -w to overwrite."); return 0; } else fs.delete(outputPath, true); } JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Processing with Language Identifier"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(LanguageIdMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("LanguagedIdDriver completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { log.error(e.getMessage(), e); fs.delete(outputPath, true); return -1; } finally { } return 0; }
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int convert(Path vectorPath, Path labelPath, Path output) throws IOException { JobConf job = new JobConf(getConf()); // job.setJobName(this.getClass().getName()); job.setJarByClass(this.getClass()); FileInputFormat.addInputPath(job, vectorPath); FileInputFormat.addInputPath(job, labelPath); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(IdentityMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // 1 reducers job.setNumReduceTasks(1);//from w w w .j a v a 2 s .c o m job.setReducerClass(Mahout2LibSVM.class); FileOutputFormat.setOutputPath(job, output); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); RunningJob rj = JobClient.runJob(job); boolean success = rj.isSuccessful(); if (!success) return -1; if (log.isInfoEnabled()) { log.info("Conversion: done"); } return 0; }
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); GroupBuilder gBuilder = new GroupBuilder().withName("Options:"); List<Option> options = new ArrayList<Option>(); Option inputOpt = buildOption("input", "i", "The input path", true, true, null); options.add(inputOpt);/*from w w w . java 2 s . c o m*/ Option outOpt = buildOption("output", "o", "The output path", true, true, null); options.add(outOpt); Option tikaOpt = buildOption("tikaProcessor", "t", "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true, false, null); options.add(tikaOpt); Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, ""); options.add(mimeTypeOpt); for (Option opt : options) { gBuilder = gBuilder.withOption(opt); } Group group = gBuilder.create(); try { Parser parser = new Parser(); parser.setGroup(group); // TODO catch exceptions with parsing of opts CommandLine cmdLine = parser.parse(args); Path inputPath = new Path(cmdLine.getValue(inputOpt).toString()); Path outputPath = new Path(cmdLine.getValue(outOpt).toString()); String handlerName = null; if (cmdLine.hasOption(tikaOpt)) { handlerName = cmdLine.getValue(tikaOpt).toString(); } JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); if (cmdLine.hasOption(mimeTypeOpt)) { String mimeType = cmdLine.getValue(mimeTypeOpt).toString(); job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType); } if (handlerName != null && handlerName.equals("") == false) { job.set(TIKA_PROCESSOR_KEY, handlerName); } job.setJobName("Tika : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TikaMapper.class); boolean isFilterRequired = BehemothReducer.isRequired(job); if (isFilterRequired) job.setReducerClass(BehemothReducer.class); else { job.setNumReduceTasks(0); } FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("TikaDriver completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { log.error("Exception", e); return -1; // don't delete the output as some of it could be used // fs.delete(outputPath, true); } finally { } } catch (OptionException e) { log.error("OptionException", e.getMessage()); HelpFormatter formatter = new HelpFormatter(); formatter.setGroup(group); formatter.print(); return -1; } return 0; }
From source file:com.digitalpebble.behemoth.uima.UIMADriver.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); if (args.length != 3) { String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file"; System.err.println(syntax); return -1; }/*from w ww . java 2s . c om*/ Path inputPath = new Path(args[0]); Path outputPath = new Path(args[1]); String pearPath = args[2]; // check that the GATE application has been stored on HDFS Path zap = new Path(pearPath); if (fs.exists(zap) == false) { System.err.println("The UIMA application " + pearPath + "can't be found on HDFS - aborting"); return -1; } JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Processing with UIMA application : " + pearPath); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(UIMAMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); // push the UIMA pear onto the DistributedCache DistributedCache.addCacheFile(new URI(pearPath), job); job.set("uima.pear.path", pearPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error("Exception", e); fs.delete(outputPath, true); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;//from w ww . j av a 2 s. c o m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); if (line.hasOption("help")) { formatter.printHelp("CorpusFilter", options); return 0; } if (input == null | output == null) { formatter.printHelp("CorpusFilter", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusFilter", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("CorpusFilter : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); boolean isFilterRequired = BehemothMapper.isRequired(job); // should be the case here if (!isFilterRequired) { System.err.println("No filters configured. Check your behemoth-site.xml"); return -1; } job.setMapperClass(BehemothMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); fs.delete(outputPath, true); } finally { } return 0; }