List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.SimpleTextSearch.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(SimpleTextSearch.class); job.setJobName(SimpleTextSearch.class.getName()); // mapper/*from w w w . j ava 2s.c o m*/ job.setMapperClass(TextSearchMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; // regex with a phrase to be searched for String regex = otherArgs[2]; job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.WordCounterExample.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(WordCounterExample.class); job.setJobName(WordCounterExample.class.getName()); // mapper/*w w w . jav a2 s. c o m*/ job.setMapperClass(WordCounterMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.TopDomainCounter.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(); job.setJarByClass(TopDomainCounter.class); job.setJobName(TopDomainCounter.class.getName()); // mapper/*from ww w . j a v a 2s .c o m*/ job.setMapperClass(DomainMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // combiner + reducer job.setCombinerClass(TextLongCountingReducer.class); job.setReducerClass(TextLongCountingReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.WARCRecordCounter.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); System.out.println("Other args: " + Arrays.toString(otherArgs)); Job job = Job.getInstance(conf); job.setJarByClass(WARCRecordCounter.class); job.setJobName(WARCRecordCounter.class.getName()); // mapper/*from www. j av a 2 s . co m*/ job.setMapperClass(ResponseMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // combiner + reducer job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); job.setInputFormatClass(WARCInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // paths String commaSeparatedInputFiles = otherArgs[0]; String outputPath = otherArgs[1]; FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:distributed.hadoop.MapReduceJobConfig.java
License:Open Source License
/** * Apply the settings encapsulated in this config and return a Job object * ready for execution.//from ww w. j a va2 s .c o m * * @param jobName the name of the job * @param conf the Configuration object that will be wrapped in the Job * @param env environment variables * @return a configured Job object * @throws IOException if a problem occurs * @throws ClassNotFoundException if various classes are not found */ public Job configureForHadoop(String jobName, Configuration conf, Environment env) throws IOException, ClassNotFoundException { String jobTrackerPort = getJobTrackerPort(); if (DistributedJobConfig.isEmpty(jobTrackerPort)) { jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN : AbstractHadoopJobConfig.DEFAULT_PORT; } String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort; if (DistributedJobConfig.isEmpty(jobTracker)) { System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ") + "set - running locally..."); } else { jobTracker = environmentSubstitute(jobTracker, env); if (AbstractHadoopJobConfig.isHadoop2()) { conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker); conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS, environmentSubstitute(getJobTrackerHost(), env) + ":8030"); } else { conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker); } } System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ") + jobTracker); if (AbstractHadoopJobConfig.isHadoop2()) { // a few other properties needed to run against Yarn conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle"); conf.set("mapreduce.framework.name", "yarn"); } if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) { conf.set(AbstractHadoopJobConfig.isHadoop2() ? 
HADOOP2_MAPRED_MAX_SPLIT_SIZE : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize()); } // Do any user supplied properties here before creating the Job for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) { conf.set(e.getKey(), e.getValue()); } m_hdfsConfig.configureForHadoop(conf, env); Job job = new Job(conf, jobName); String numMappers = getNumberOfMaps(); if (!DistributedJobConfig.isEmpty(numMappers)) { numMappers = environmentSubstitute(numMappers, env); ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers)); } // The number of map tasks that will be run simultaneously by a task tracker String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum(); if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) { ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks); } String numReducers = getNumberOfReducers(); if (!DistributedJobConfig.isEmpty(numReducers)) { numReducers = environmentSubstitute(numReducers, env); job.setNumReduceTasks(Integer.parseInt(numReducers)); if (Integer.parseInt(numReducers) == 0) { System.err.println("Warning - no reducer class set. Configuring for a map only job"); } } else { job.setNumReduceTasks(1); } String mapperClass = getMapperClass(); if (DistributedJobConfig.isEmpty(mapperClass)) { throw new IOException("No mapper class specified!"); } mapperClass = environmentSubstitute(mapperClass, env); @SuppressWarnings("unchecked") Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass); job.setMapperClass(mc); String reducerClass = getReducerClass(); if (DistributedJobConfig.isEmpty(reducerClass) && Integer.parseInt(numReducers) > 0) { throw new IOException("No reducer class specified!"); } else if (job.getNumReduceTasks() > 0) { reducerClass = environmentSubstitute(reducerClass, env); @SuppressWarnings("unchecked") Class<? extends Reducer> rc = (Class<? 
extends Reducer>) Class.forName(reducerClass); job.setReducerClass(rc); } String combinerClass = getCombinerClass(); if (!DistributedJobConfig.isEmpty(combinerClass)) { combinerClass = environmentSubstitute(combinerClass, env); @SuppressWarnings("unchecked") Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass); job.setCombinerClass(cc); } String inputFormatClass = getInputFormatClass(); if (DistributedJobConfig.isEmpty(inputFormatClass)) { throw new IOException("No input format class specified"); } inputFormatClass = environmentSubstitute(inputFormatClass, env); @SuppressWarnings("unchecked") Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass); job.setInputFormatClass(ifc); String outputFormatClass = getOutputFormatClass(); if (DistributedJobConfig.isEmpty(outputFormatClass)) { throw new IOException("No output format class specified"); } outputFormatClass = environmentSubstitute(outputFormatClass, env); @SuppressWarnings("unchecked") Class<? extends OutputFormat> ofc = (Class<? 
extends OutputFormat>) Class.forName(outputFormatClass); job.setOutputFormatClass(ofc); String mapOutputKeyClass = getMapOutputKeyClass(); if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) { throw new IOException("No map output key class defined"); } mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env); Class mokc = Class.forName(mapOutputKeyClass); job.setMapOutputKeyClass(mokc); String mapOutputValueClass = getMapOutputValueClass(); if (DistributedJobConfig.isEmpty(mapOutputValueClass)) { throw new IOException("No map output value class defined"); } mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env); Class movc = Class.forName(mapOutputValueClass); job.setMapOutputValueClass(movc); String outputKeyClass = getOutputKeyClass(); if (DistributedJobConfig.isEmpty(outputKeyClass)) { throw new IOException("No output key class defined"); } outputKeyClass = environmentSubstitute(outputKeyClass, env); Class okc = Class.forName(outputKeyClass); job.setOutputKeyClass(okc); String outputValueClass = getOutputValueClass(); if (DistributedJobConfig.isEmpty(outputValueClass)) { throw new IOException("No output value class defined"); } outputValueClass = environmentSubstitute(outputValueClass, env); Class ovc = Class.forName(outputValueClass); job.setOutputValueClass(ovc); String inputPaths = getInputPaths(); // don't complain if there aren't any as inputs such as HBASE // require other properties to be set if (!DistributedJobConfig.isEmpty(inputPaths)) { inputPaths = environmentSubstitute(inputPaths, env); FileInputFormat.setInputPaths(job, inputPaths); } String outputPath = getOutputPath(); if (DistributedJobConfig.isEmpty(outputPath)) { throw new IOException("No output path specified"); } outputPath = environmentSubstitute(outputPath, env); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job; }
From source file:dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java
License:Apache License
public int run(String[] args) throws Exception { Configuration configuration = getConf(); Job job = new Job(configuration, "ARC Header Extractor"); job.setJarByClass(ARCHeaderExtractorMR.class); job.setMapperClass(ARCHeaderExtractorMapper.class); job.setCombinerClass(ARCHeaderExtractorReducer.class); job.setReducerClass(ARCHeaderExtractorReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); int n = args.length; if (n == 0 || n > 2) { System.err.println(//from w w w . j a v a 2s. co m "Not enough arguments. input dir and output dir mandatory. Only " + n + " were supplied."); System.exit(0); } SequenceFileInputFormat.addInputPath(job, new Path(args[0])); SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : -1; }
From source file:drdoobs.AggregateJob.java
/**
 * Builds the aggregation job and waits for it to finish.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return {@code 0} when the job succeeds, {@code 1} when it does not
 * @throws Exception if configuring or running the job fails
 */
public int run(String[] args) throws Exception {
    final Job aggregateJob = new Job(getConf());

    // identify the job by this class
    final Class<?> jobClass = getClass();
    aggregateJob.setJarByClass(jobClass);
    aggregateJob.setJobName(jobClass.getSimpleName());

    // where to read from and where to write to
    final Path inputPath = new Path(args[0]);
    FileInputFormat.addInputPath(aggregateJob, inputPath);
    final Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(aggregateJob, outputPath);

    // projection in the map phase, long sums in the combine and reduce phases
    aggregateJob.setMapperClass(ProjectionMapper.class);
    aggregateJob.setCombinerClass(LongSumReducer.class);
    aggregateJob.setReducerClass(LongSumReducer.class);

    aggregateJob.setOutputKeyClass(Text.class);
    aggregateJob.setOutputValueClass(LongWritable.class);

    final boolean succeeded = aggregateJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}
From source file:dz.lab.mapred.counter.StartsWithCountJob_PrintCounters.java
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // the following property will enable mapreduce to use its packaged local job runner //conf.set("mapreduce.framework.name", "local"); Job job = Job.getInstance(conf, "StartsWithCountJob"); job.setJarByClass(getClass());/* w w w . j av a2 s . c o m*/ // configure output and input source TextInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(TextInputFormat.class); // configure mapper and reducer job.setMapperClass(StartsWithCountMapper.class); job.setCombinerClass(StartsWithCountReducer.class); job.setReducerClass(StartsWithCountReducer.class); // configure output TextOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); int resultCode = job.waitForCompletion(true) ? 0 : 1; System.out.println("Job is complete! Printing Counters:"); Counters counters = job.getCounters(); for (String groupName : counters.getGroupNames()) { CounterGroup group = counters.getGroup(groupName); System.out.println(group.getDisplayName()); for (Counter counter : group.getUnderlyingGroup()) { System.out.println(" " + counter.getDisplayName() + "=" + counter.getValue()); } } return resultCode; }
From source file:dz.lab.mapred.exclude.StartsWithCountJob_DistCacheAPI.java
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // the following property will enable mapreduce to use its packaged local job runner //conf.set("mapreduce.framework.name", "local"); Job job = Job.getInstance(conf, "StartsWithCountJob"); job.setJarByClass(getClass());/*from w w w. j a v a2s. c o m*/ // configure output and input source TextInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(TextInputFormat.class); // configure mapper and reducer job.setMapperClass(StartsWithCountMapper.class); job.setCombinerClass(StartsWithCountReducer.class); job.setReducerClass(StartsWithCountReducer.class); // configure output TextOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); Path toCache = new Path("/training/data/startWithExcludeFile.txt"); // add file to cache job.addCacheFile(toCache.toUri()); // create symbolic links for all files in DistributedCache; without the links you would have to use fully qualified path job.createSymlink(); return job.waitForCompletion(true) ? 0 : 1; }
From source file:dz.lab.mapred.hbase.custom_input.StartsWithCountJob_HBaseInput.java
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf(), "StartsWithCount-FromHBase"); job.setJarByClass(getClass());/*from w w w.j a v a 2 s . c o m*/ // set HBase InputFormat job.setInputFormatClass(TableInputFormat.class); // new mapper to handle data from HBase job.setMapperClass(StartsWithCountMapper_HBase.class); // add hbase configuration Configuration conf = job.getConfiguration(); HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf)); TableMapReduceUtil.addDependencyJars(job); // specify table and column to read from conf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME); conf.set(TableInputFormat.SCAN_COLUMNS, "count:word"); // configure mapper and reducer job.setCombinerClass(StartsWithCountReducer.class); job.setReducerClass(StartsWithCountReducer.class); // configure output TextOutputFormat.setOutputPath(job, new Path(args[0])); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); return job.waitForCompletion(true) ? 0 : 1; }