List of usage examples for org.apache.hadoop.mapreduce.Job#setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException

Sets the Partitioner for the job; an IllegalStateException is thrown if the job has already been submitted.
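The examples below register both a stock partitioner (TotalOrderPartitioner) and various custom ones. For orientation, here is a minimal sketch of the pattern they all follow: subclass Partitioner, implement getPartition, and register the class on the Job before submission. The class and job names here (UserIdPartitioner, "demo") are hypothetical, for illustration only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical custom partitioner: send all map outputs sharing a key
// to the same reducer, spreading distinct keys across reducers by hash.
public class UserIdPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the modulus is always non-negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Driver side, before the job is submitted (setPartitionerClass would
// throw IllegalStateException if called after submission):
//   Job job = Job.getInstance(new Configuration(), "demo");
//   job.setPartitionerClass(UserIdPartitioner.class);
//   job.setNumReduceTasks(4);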
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler =
            new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);
    InputSampler.writePartitionFile(job, sampler);

    // Add to DistributedCache
    Configuration conf = job.getConfiguration();
    String partitionFile = TotalOrderPartitioner.getPartitionFile(conf);
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:cs6240.project.decisiontree.Pseudohigstest.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    DistributedCache.addCacheFile(new URI("s3://hr6240/higs/testing/5/higshistogram"), conf);
    // DistributedCache.addCacheFile(new URI("/home/hraj17/Downloads/part-hig"), conf);

    Job job = new Job(conf, "word count");
    job.setJarByClass(Pseudohigstest.class);
    job.setMapperClass(TestingMapper.class);
    job.setReducerClass(TestingReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setPartitionerClass(TestingPartioner.class);
    job.setNumReduceTasks(2);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cs6240.project.decisiontree.Pseudotestingtwitter.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    DistributedCache.addCacheFile(new URI("s3://hr6240/histogram/5/metadata5"), conf);

    Job job = new Job(conf, "word count");
    job.setJarByClass(Pseudotestingtwitter.class);
    job.setMapperClass(TestingMapper.class);
    job.setReducerClass(TestingReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setPartitionerClass(TestingPartioner.class);
    job.setNumReduceTasks(2);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:csc555.ebratt.depaul.edu.RCTop10Driver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments.
 *
 * @param args
 *            [0] the input directory on HDFS;
 *            [1] the output directory on HDFS
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "Top 10 Reddit");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // ensure 1 reduce task for ranking
    job.setNumReduceTasks(1);

    // Mapper and Reducer classes to use
    job.setMapperClass(RCTop10Mapper.class);
    job.setReducerClass(RCTop10Reducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(GroupByCountPair.class);
    job.setMapOutputValueClass(Text.class);

    // set custom partitioner
    job.setPartitionerClass(GroupByCountPairPartitioner.class);

    // set custom grouping comparator
    job.setGroupingComparatorClass(GroupByGroupingComparator.class);

    // input class
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(GroupByCountPair.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // the jar file to run
    job.setJarByClass(RCTop10Driver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf,
        boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport, Window mode,
        int winsize) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);
        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");
        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);
        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");
        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase4RemoveDuplicatesUsingReduceSideJoins.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text file of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input: the lookup text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class);
    // first input: the data set (check comma-separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
            JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:demo.SsJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);
    return 0;
}
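The NaturalKeyPartitioner referenced above belongs to the example's own codebase and is not shown in this listing. In the standard secondary-sort pattern it would partition on the natural key alone (the stock symbol), so all composite keys for one symbol reach the same reducer while CompositeKeyComparator orders them within it. A plausible sketch, assuming StockKey exposes a getSymbol() accessor (an assumption; the real class may differ):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class NaturalKeyPartitioner extends Partitioner<StockKey, DoubleWritable> {
    @Override
    public int getPartition(StockKey key, DoubleWritable value, int numPartitions) {
        // Partition on the natural key only, ignoring the secondary
        // field, so every record for a symbol lands in one reduce task.
        // getSymbol() is assumed; the actual StockKey API is not shown.
        return (key.getSymbol().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}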
From source file:edu.buffalo.cse.dic.mapreduce.WordCount.java
License:Apache License
@Override
public Map<String, Number> start(String inputFile) {
    try {
        LinkedHashMap<String, Number> topTen = new LinkedHashMap<>();
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/hdfs-site.xml"));

        FileSystem fs = FileSystem.get(new URI("wordcount"), conf);
        fs.delete(new Path("wordcount"));

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path("wordcount"));
        job.waitForCompletion(true);
        System.out.println("word count done");

        FileSystem fsa = FileSystem.get(new URI("wordcount"), conf);
        fsa.delete(new Path("wordcountfinal"));

        Job sortJob = new Job(conf, "sort reducer");
        sortJob.setJarByClass(SortReducerOutput.class);
        sortJob.setMapperClass(OutputBreaker.class);
        sortJob.setSortComparatorClass(ReverseComparator.class);
        sortJob.setReducerClass(SortByCount.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        sortJob.setPartitionerClass(TotalOrderPartitioner.class);
        Path partitionFile = new Path("trendcount", "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), partitionFile);
        FileInputFormat.addInputPath(sortJob, new Path("wordcount/part-r-00000"));
        FileOutputFormat.setOutputPath(sortJob, new Path("wordcountfinal"));
        sortJob.waitForCompletion(true);
        System.out.println("sort word count");

        Path output = new Path("wordcountfinal/part-r-00000");
        FileSystem fileSystem = FileSystem.get(output.toUri(), conf);
        FileStatus[] items = fileSystem.listStatus(output);
        for (FileStatus item : items) {
            InputStream stream = null;
            // ignoring files like _SUCCESS
            if (item.getPath().getName().startsWith("_")) {
                continue;
            } else {
                stream = fileSystem.open(item.getPath());
            }
            Scanner scan = new Scanner(stream).useDelimiter("\\n");
            for (int i = 0; i < 10; i++) {
                if (scan.hasNext()) {
                    String data = scan.next();
                    topTen.put(data.split("\\t")[1], Integer.parseInt(data.split("\\t")[0]));
                }
            }
        }
        return topTen;
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }
    return null;
}
From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs", conf);
    String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: HarvestParaphraseCandidates");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParaphraseCandidates");

    // harvest all (context, pattern) triples
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath);
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs);
    conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches);
    conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples");
    new HarvestContextPatternPairs(conf).run();

    FileInputFormat.addInputPath(job, new Path(outputPath + "/triples"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all"));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    job.setMapOutputValueClass(TextLongPairWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    // combine scores
    // conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all");
    // conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns");
    // new CombineScores(conf).run();

    // only retain the top paraphrases
    conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all");
    conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k");
    conf.set("Mavuno.GetTopResults.NumResults", numResults);
    conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    new GetTopResults(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all");

    return 0;
}