List of usage examples for org.apache.hadoop.mapreduce.Job.setJobName
public void setJobName(String name) throws IllegalStateException
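For orientation, here is a minimal, hypothetical driver (the class name, job name, and argument paths are illustrative and not taken from the examples below) showing where setJobName fits: it is called after the Job is created and before submission, since the method throws IllegalStateException once the job is in a running state.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetJobNameExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // setJobName must be called before submission; once the job is
        // running it throws IllegalStateException.
        job.setJobName("set-job-name-example");
        job.setJarByClass(SetJobNameExample.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Placeholder input/output paths passed on the command line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}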
From source file:edu.udel.mxv.Mxv.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println(USAGE);
        System.exit(1);
    }

    int n = Integer.parseInt(args[0]);
    String input_matrix = args[1];
    String input_vector = args[2];
    String output = args[3];

    Configuration conf = getConf();
    conf.set("vector.path", input_vector);
    conf.setInt("vector.n", n);

    Job job = new Job(conf);
    job.setJobName("mxv");
    job.setJarByClass(getClass());

    // mapper
    job.setMapperClass(MxvMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    // reducer
    job.setReducerClass(MxvRed.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    // job.setNumReduceTasks(num_red);

    FileInputFormat.addInputPath(job, new Path(input_matrix));
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) @Override/*from ww w . j a v a 2 s . c o m*/ public int runTool() throws Exception { Configuration conf = getConf(); Job job = new Job(conf); String inputPath = conf.get("Cloud9.InputPath"); String inputFormat = conf.get("Cloud9.InputFormat"); String outputPath = conf.get("Cloud9.OutputPath"); String tag = conf.get("Cloud9.TargetTag"); job.setJobName("ExtractFieldCollection"); job.setJarByClass(ExtractHTMLFieldCollection.class); job.setMapperClass(MyMapper.class); job.setReducerClass(Reducer.class); job.setNumReduceTasks(200); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); recursivelyAddInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(TextDocument.class); LOG.info("ExtractFieldCollection - " + tag); LOG.info(" - Input path: " + inputPath); LOG.info(" - Input format: " + inputFormat); LOG.info(" - Output path: " + outputPath); LOG.info(" - Target tag: " + tag); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase1(String inputPath, int reduceNo, String lang)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "tmp/wiki-link/phase1";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    if ("en".equals(lang)) {
        job.setInputFormatClass(WikipediaPageInputFormat.class);
    } else {
        throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported ");
    }

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setMapperClass(LinkEmitMapClass.class);
    job.setReducerClass(RedirectResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase2(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "tmp/wiki-link/phase2";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setReducerClass(DestinationIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase3(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "trace/phase3";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(SourceIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
From source file:edu.umd.cloud9.collection.wikipedia.CountWikipediaPages.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from ww w . j a va 2 s. c o m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = "en"; // Assume 'en' by default. if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - language: " + language); Job job = Job.getInstance(getConf()); job.setJarByClass(CountWikipediaPages.class); job.setJobName(String.format("CountWikipediaPages[%s: %s, %s: %s]", INPUT_OPTION, inputPath, LANGUAGE_OPTION, language)); job.setNumReduceTasks(0); FileInputFormat.setInputPaths(job, new Path(inputPath)); if (language != null) { job.getConfiguration().set("wiki.language", language); } job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapperClass(MyMapper.class); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.collection.wikipedia.DumpWikipediaToPlainText.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from ww w . ja v a2s . co m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); options.addOption(OptionBuilder.withArgName("TEXT|HTML|WIKI").hasArg() .withDescription("Output Content Type TEXT, HTML, WIKI").create(CONTENT_FORMAT_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = "en"; // Assume "en" by default. if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String contentFormat = "TEXT"; // Assume "TEXT" by default. if (cmdline.hasOption(CONTENT_FORMAT_OPTION)) { contentFormat = cmdline.getOptionValue(CONTENT_FORMAT_OPTION); if (!contentFormat.equals("TEXT") && !contentFormat.equals("HTML") && !contentFormat.equals("WIKI")) { System.err.println("Error: \"" + contentFormat + "\" unknown content type!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path : " + outputPath); LOG.info(" - language : " + language); LOG.info(" - content_type : " + contentFormat); Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(DumpWikipediaToPlainText.class); job.setJobName(String.format("DumpWikipediaToPlainText[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, LANGUAGE_OPTION, language, CONTENT_FORMAT_OPTION, contentFormat)); job.setNumReduceTasks(0); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); if (language != null) { job.getConfiguration().set("wiki.language", language); } if (contentFormat != null) { job.getConfiguration().set("wiki.content_format", contentFormat); } job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task1(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    job.setNumReduceTasks(10);

    // increase heap
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(PairOfStrings.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PairOfIntString.class);

    job.setMapperClass(MyMapper1.class);
    job.setReducerClass(MyReducer1.class);
    job.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
}
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task2(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    job.setNumReduceTasks(1);

    // increase heap
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper2.class);
    job.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    // Clean up intermediate data.
    FileSystem.get(job.getConfiguration()).delete(new Path(inputPath), true);
}
From source file:edu.umd.cloud9.collection.wikipedia.RepackWikipedia.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from w ww .ja va 2 s . co m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file") .create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION); String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION); if (!"block".equals(compressionType) && !"record".equals(compressionType) && !"none".equals(compressionType)) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { // Added length check for 6 to include languages like zh_yue System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } // this is the default block size int blocksize = 1000000; Job job = Job.getInstance(getConf()); job.setJarByClass(RepackWikipedia.class); job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language)); job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno mapping data file: " + mappingFile); LOG.info(" - compression type: " + compressionType); LOG.info(" - language: " + language); if ("block".equals(compressionType)) { LOG.info(" - block size: " + blocksize); } job.setNumReduceTasks(0); SequenceFileInputFormat.addInputPath(job, new Path(inputPath)); SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath)); if ("none".equals(compressionType)) { SequenceFileOutputFormat.setCompressOutput(job, false); } else { SequenceFileOutputFormat.setCompressOutput(job, true); if ("record".equals(compressionType)) { SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(job, 
SequenceFile.CompressionType.BLOCK); job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize); } } if (language != null) { job.getConfiguration().set("wiki.language", language); } job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language)); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }