List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
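The class passed to setCombinerClass must be a Reducer whose input and output key/value types both match the map output types, because the framework may apply the combiner zero, one, or several times on the map side; the aggregation therefore needs to be associative and commutative. A minimal sum-style combiner as a sketch of that contract (the class name IntSumCombiner and the wiring lines are illustrative, not taken from the examples below):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Input and output types are both (Text, IntWritable): the combiner's output
// feeds back into the shuffle as if it were map output.
public static class IntSumCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

Because integer addition is associative and commutative, the same class can serve as both the combiner and the reducer, which is exactly how IntSumReducer is used in several of the examples below:

// job is an already-configured org.apache.hadoop.mapreduce.Job
job.setCombinerClass(IntSumCombiner.class);
job.setReducerClass(IntSumCombiner.class);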
From source file:ece465.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Map<String, String> env = System.getenv();
    Path coreSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/core-site.xml");
    Path hdfsSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/hdfs-site.xml");
    Path yarnSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/yarn-site.xml");
    Path mapredSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/mapred-site.xml");
    conf.addResource(coreSiteXml);
    conf.addResource(hdfsSiteXml);
    conf.addResource(yarnSiteXml);
    conf.addResource(mapredSiteXml);

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path inputPath = new Path(otherArgs[0]);
    System.out.println(inputPath);
    Path outputPath = new Path(otherArgs[1]);
    System.out.println(outputPath);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:edu.bigdata.training.core.mapreduce.WordCount.java
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    System.out.println("arg[0]-->" + args[0]);
    System.out.println("arg[1]-->" + args[1]);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(SimpleMapper.class);
    job.setCombinerClass(Reduce.class);
    // The summing Reduce class must also be set as the reducer: combiner
    // execution is optional, and the base Reducer class is just the identity.
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean success = job.waitForCompletion(true);
    // Read the counter before calling System.exit(), which never returns.
    System.out.println("Total Words: " + job.getCounters().findCounter(METRICS.TOTAL_WORDS).getValue());
    System.exit(success ? 0 : 1);
}
From source file:edu.bigdata.training.mrcassandra.MapReduceExample.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "MR Keying");
    job.setJarByClass(MapReduceExample.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path("/user/root/input/all-shakespeare.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/user/root/output/"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:edu.buffalo.cse.dic.mapreduce.WordCount.java
License:Apache License
@Override
public Map<String, Number> start(String inputFile) {
    try {
        LinkedHashMap<String, Number> topTen = new LinkedHashMap<>();
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/hdfs-site.xml"));

        FileSystem fs = FileSystem.get(new URI("wordcount"), conf);
        fs.delete(new Path("wordcount"), true);

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path("wordcount"));
        job.waitForCompletion(true);
        System.out.println("word count done");

        FileSystem fsa = FileSystem.get(new URI("wordcount"), conf);
        fsa.delete(new Path("wordcountfinal"), true);

        Job sortJob = new Job(conf, "sort reducer");
        sortJob.setJarByClass(SortReducerOutput.class);
        sortJob.setMapperClass(OutputBreaker.class);
        sortJob.setSortComparatorClass(ReverseComparator.class);
        sortJob.setReducerClass(SortByCount.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        sortJob.setPartitionerClass(TotalOrderPartitioner.class);
        Path partitionFile = new Path("trendcount", "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), partitionFile);
        FileInputFormat.addInputPath(sortJob, new Path("wordcount/part-r-00000"));
        FileOutputFormat.setOutputPath(sortJob, new Path("wordcountfinal"));
        sortJob.waitForCompletion(true);
        System.out.println("sort word count");

        Path output = new Path("wordcountfinal/part-r-00000");
        FileSystem fileSystem = FileSystem.get(output.toUri(), conf);
        FileStatus[] items = fileSystem.listStatus(output);
        for (FileStatus item : items) {
            // ignore marker files like _SUCCESS
            if (item.getPath().getName().startsWith("_")) {
                continue;
            }
            InputStream stream = fileSystem.open(item.getPath());
            Scanner scan = new Scanner(stream).useDelimiter("\\n");
            for (int i = 0; i < 10; i++) {
                if (scan.hasNext()) {
                    String data = scan.next();
                    topTen.put(data.split("\\t")[1], Integer.parseInt(data.split("\\t")[0]));
                }
            }
        }
        return topTen;
    } catch (IOException | ClassNotFoundException | InterruptedException | URISyntaxException e) {
        e.printStackTrace();
    }
    return null;
}
From source file:edu.columbia.hs2807.Sentiment.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "sentiment");
    job.setJarByClass(Sentiment.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LongArrayWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
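In this job the map output types (IntWritable, LongArrayWritable) differ from the final output types (IntWritable, DoubleWritable), so the reducer cannot double as the combiner and a separate Combine class is required. That class is not shown in the listing; a plausible skeleton that satisfies the type contract, assuming LongArrayWritable wraps a LongWritable[] in ArrayWritable style and that partial counts are summed element-wise (both assumptions, not code from the source):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;

// A combiner must emit the map output types, here (IntWritable, LongArrayWritable),
// so it can only pre-aggregate; any division producing DoubleWritable averages
// has to wait for the reducer, where all partials for a key are present.
public static class Combine extends Reducer<IntWritable, LongArrayWritable, IntWritable, LongArrayWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<LongArrayWritable> values, Context context)
            throws IOException, InterruptedException {
        long[] totals = null;
        for (LongArrayWritable value : values) {
            Writable[] parts = value.get(); // ArrayWritable-style accessor (assumed)
            if (totals == null) {
                totals = new long[parts.length];
            }
            // element-wise sum of the partial count arrays (assumed semantics)
            for (int i = 0; i < parts.length; i++) {
                totals[i] += ((LongWritable) parts[i]).get();
            }
        }
        if (totals == null) {
            return; // no values for this key
        }
        LongWritable[] out = new LongWritable[totals.length];
        for (int i = 0; i < totals.length; i++) {
            out[i] = new LongWritable(totals[i]);
        }
        context.write(key, new LongArrayWritable(out)); // constructor assumed
    }
}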
From source file:edu.cooper.cloud.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 2;
    }
    Job job = new Job(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    // set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use IntSumReducer as both the combiner and the reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.cooper.cloud.Normalize.java
License:Apache License
public static void main(String[] args) throws Exception {
    String input = "datasets/train_subject01.csv";
    String output = "output/trainX2.csv";

    Configuration conf = new Configuration();
    Map<String, String> env = System.getenv();
    Path coreSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/core-site.xml");
    Path hdfsSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/hdfs-site.xml");
    Path yarnSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/yarn-site.xml");
    Path mapredSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/mapred-site.xml");
    conf.addResource(coreSiteXml);
    conf.addResource(hdfsSiteXml);
    conf.addResource(yarnSiteXml);
    conf.addResource(mapredSiteXml);

    // String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // if (otherArgs.length != 2) {
    //     System.err.println("Usage: wordcount <in> <out>");
    //     System.exit(2);
    // }

    Job job = new Job(conf, "normalize");
    job.setJarByClass(Normalize.class);
    job.setMapperClass(NormalizeMapper.class);
    job.setCombinerClass(NormalizeCombiner.class);
    job.setReducerClass(NormalizeReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleArrayWritable.class);
    // job.setInputFormatClass(new FileInputFormat<IntWritable, DoubleArrayWritable>());

    Path inputPath = new Path(input);
    System.out.println(inputPath);
    Path outputPath = new Path(output);
    System.out.println(outputPath);
    NLineInputFormat.addInputPath(job, inputPath);
    // FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
    // Use means and std dev to normalize the data
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
/**
 * Run the job using the supplied arguments.
 *
 * @param input            the directory pathname for input points
 * @param clustersOut      the directory pathname for output clusters
 * @param measureClass     the classname of the DistanceMeasure
 * @param convergenceDelta the convergence delta value
 * @return true if the iteration successfully runs
 */
private static boolean runIteration(Configuration conf, Path input, Path clustersOut, String measureClass,
        String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running runIteration");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ClusterObservations.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

    // job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(MemIDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapperClass(MemKMeansMapper.class);
    // KMeansCombiner pre-aggregates ClusterObservations per cluster id, so its
    // input and output types both match the map output types (Text, ClusterObservations).
    job.setCombinerClass(KMeansCombiner.class);
    job.setReducerClass(MemKMeansReducer.class);

    FileInputFormat.addInputPath(job, input); // input is an id list
    FileOutputFormat.setOutputPath(job, clustersOut);

    job.setJarByClass(MemCachedKMeansDriver.class);
    HadoopUtil.delete(conf, clustersOut);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means iteration failed");
    }
    return isConverged(conf);
}
From source file:edu.isi.mavuno.app.ie.HarvestSAPInstances.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusClass", conf); int minMatches = Integer .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.OutputPath", conf); sLogger.info("Tool name: HarvestSAPInstances"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Minimum matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestSAPInstances"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);/*from ww w. j ava 2s . c o m*/ return 0; }
From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass", conf);//from w w w .j av a 2 s. c o m String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf); String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf); sLogger.info("Tool name: HarvestContextPatternPairs"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor args: " + extractorArgs); sLogger.info(" - Min matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestContextPatternPairs"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }