List of usage examples for org.apache.hadoop.mapreduce Job getInstance
public static Job getInstance() throws IOException
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.SimpleTextSearch.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(SimpleTextSearch.class); job.setJobName(SimpleTextSearch.class.getName()); // mapper/*from w w w . j a v a2s . com*/ job.setMapperClass(TextSearchMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; // regex with a phrase to be searched for String regex = otherArgs[2]; job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.WordCounterExample.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(WordCounterExample.class); job.setJobName(WordCounterExample.class.getName()); // mapper/* w w w .j a va2 s.co m*/ job.setMapperClass(WordCounterMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.TopDomainCounter.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(TopDomainCounter.class); job.setJobName(TopDomainCounter.class.getName()); // mapper/*from w w w. j a v a 2 s . co m*/ job.setMapperClass(DomainMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.experiments.dip.hadoop.OriginalURLGrep.java
License:Apache License
@Override public int run(String[] args) throws Exception { org.apache.hadoop.conf.Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); System.out.println("Other args: " + Arrays.toString(otherArgs)); Job job = Job.getInstance(); job.setJarByClass(OriginalURLGrep.class); job.setJobName(OriginalURLGrep.class.getName()); job.setMapperClass(OrigURLGrepMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); // cache file - IDs for index String idFile = args[2];/* w w w. java2 s . c o m*/ System.err.println("idFile: " + idFile); job.addCacheFile(new URI(idFile + "#" + NODE_IDS)); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; System.err.println("commaSeparatedInputFiles: " + commaSeparatedInputFiles); System.err.println("outputPath: " + outputPath); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }