List of usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
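This method registers the Partitioner that decides which reduce task receives each map output record; it throws IllegalStateException if the job has already been submitted. The default is HashPartitioner, and the choice only matters when the job runs more than one reduce task. As a minimal sketch of the kind of class passed to this method (the class below is hypothetical, not taken from any of the examples that follow):

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    // Hypothetical example class: route each word to a reducer by its hash.
    public class WordPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // Mask the sign bit so the index is non-negative, then take the
            // remainder to stay within [0, numPartitions).
            return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

A driver would register it with job.setPartitionerClass(WordPartitioner.class), as in the examples below.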
From source file:hk.newsRecommender.TFIDF2.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part0 ----------------------------------------------------
    Job job0 = Job.getInstance(conf, "sfitSingleNews");
    Path output0Path = new Path(hdfsUrl + "/data/recommend/tfidf0");
    HadoopUtil.delete(conf, output0Path);
    job0.setJarByClass(TFIDF.class);
    job0.setMapperClass(Mapper_Part0.class);
    // job1.setCombinerClass(Combiner_Part1.class); // combiner?
    job0.setReducerClass(Reduce_Part0.class);
    job0.setMapOutputKeyClass(Text.class);
    job0.setMapOutputValueClass(Text.class);
    job0.setOutputKeyClass(Text.class);
    job0.setOutputValueClass(Text.class);
    // job1.setNumReduceTasks(p.length);
    FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/data2.txt"));
    FileOutputFormat.setOutputPath(job0, output0Path);
    job0.waitForCompletion(true);

    // part1 ----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDF.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf0"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2 ----------------------------------------------------
    Job job2 = Job.getInstance(conf, "computeTFIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDF.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3 ----------------------------------------------------
    Configuration conf3 = new Configuration();
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    Job job3 = Job.getInstance(conf3, "My_tdif_part3");
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class);
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);
}
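The part3 job above pairs setPartitionerClass with setGroupingComparatorClass, the usual secondary-sort arrangement: the partitioner routes all records sharing a natural key to the same reducer, and the grouping comparator makes them arrive in a single reduce() call. The CustomKey, CustomPartitioner, and CustomGroupComparator sources are not shown on this page, so the sketch below (one class per file) is a hypothetical reconstruction of that pattern; the getTerm() accessor is an assumption.

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Partitioner;

    public class CustomPartitioner extends Partitioner<CustomKey, NullWritable> {
        @Override
        public int getPartition(CustomKey key, NullWritable value, int numPartitions) {
            // Partition on the natural key only, so all records for one term
            // reach the same reducer regardless of their secondary sort field.
            return (key.getTerm().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public class CustomGroupComparator extends WritableComparator {
        protected CustomGroupComparator() {
            super(CustomKey.class, true); // true: instantiate keys for comparison
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Group reduce input by the natural key, ignoring the sort field,
            // so one reduce() call sees all values for a term in sorted order.
            return ((CustomKey) a).getTerm().compareTo(((CustomKey) b).getTerm());
        }
    }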
From source file:hk.newsRecommender.TFIDFClassify.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part1 ----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/class1/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDFClassify.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/data3.txt"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2 ----------------------------------------------------
    Job job2 = Job.getInstance(conf, "computIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/class1/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDFClassify.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/class1/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3 ----------------------------------------------------
    Job job3 = Job.getInstance(conf, "sortByTFIDFDec");
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/class1/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class);
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/class1/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);

    // part4 (disabled) -----------------------------------------
    // Job job4 = Job.getInstance(conf, "siftKeywords");
    // Path outputPath4 = new Path(hdfsUrl + "/data/recommend/class1/matrix1");
    // HadoopUtil.delete(conf, outputPath4);
    // job4.setJarByClass(TFIDF.class);
    // job4.setMapperClass(Mapper_Part4.class);
    // job4.setReducerClass(Reduce_Part4.class);
    // job4.setMapOutputKeyClass(Text.class);
    // job4.setMapOutputValueClass(Text.class);
    // job4.setOutputKeyClass(Text.class);
    // job4.setOutputValueClass(Text.class);
    // job4.setPartitionerClass(CustomPartitioner.class);
    // FileInputFormat.addInputPath(job4, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    // FileOutputFormat.setOutputPath(job4, outputPath4);
    // job4.waitForCompletion(true);

    // part5 ----------------------------------------------------
    FileSystem fsopen = FileSystem.get(conf);
    FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    Scanner scan = new Scanner(in);
    List<String> keywordList = new ArrayList<String>();
    while (scan.hasNext()) {
        keywordList.add(scan.next());
    }
    // Must be set on the Configuration before the job is created.
    conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));

    Job job5 = Job.getInstance(conf, "generateMatrix");
    Path outputPath5 = new Path(hdfsUrl + "/data/recommend/class1/matrix2");
    HadoopUtil.delete(conf, outputPath5);
    job5.setJarByClass(TFIDF.class);
    job5.setMapperClass(Mapper_Part5.class);
    job5.setReducerClass(Reduce_Part5.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job5, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    FileOutputFormat.setOutputPath(job5, outputPath5);
    job5.waitForCompletion(true);
}
From source file:hr.fer.tel.rovkp.homework02.task02.Program.java
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job job = Job.getInstance();
    job.setJarByClass(Program.class);
    job.setJobName("Locations");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(LocationsMapper.class);
    job.setPartitionerClass(LocationsPartitioner.class);
    job.setReducerClass(LocationsReducer.class);
    job.setNumReduceTasks(6);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, NullWritable.class, Text.class);

    job.waitForCompletion(true);
}
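A detail worth noting in this example: setNumReduceTasks(6) must agree with the partition indices the partitioner produces, since each index selects one of the six reduce tasks. The LocationsPartitioner source is not shown on this page, so the following is only a plausible sketch under that assumption:

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    // Hypothetical reconstruction; the real LocationsPartitioner is not shown above.
    public class LocationsPartitioner extends Partitioner<IntWritable, Text> {
        @Override
        public int getPartition(IntWritable key, Text value, int numPartitions) {
            // Spread the integer location keys across the six reducers
            // configured by job.setNumReduceTasks(6).
            return (key.get() & Integer.MAX_VALUE) % numPartitions;
        }
    }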
From source file:hr.fer.tel.rovkp.homework02.task03.Program.java
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job firstJob = Job.getInstance();
    firstJob.setJarByClass(Program.class);
    firstJob.setJobName("Locations");

    FileInputFormat.addInputPath(firstJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(firstJob, new Path(INTERMEDIATE_PATH));

    firstJob.setMapperClass(LocationsMapper.class);
    firstJob.setPartitionerClass(LocationsPartitioner.class);
    firstJob.setReducerClass(LocationsReducer.class);
    firstJob.setNumReduceTasks(6);

    firstJob.setOutputKeyClass(IntWritable.class);
    firstJob.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(firstJob, "bins", TextOutputFormat.class, NullWritable.class, Text.class);

    int code = firstJob.waitForCompletion(true) ? 0 : 1;
    System.out.println("First job return code: " + code);

    if (code == 0) {
        Job job1 = run(INTERMEDIATE_PATH + "center1", args[1] + "/1");
        Job job2 = run(INTERMEDIATE_PATH + "not_center1", args[1] + "/2");
        Job job3 = run(INTERMEDIATE_PATH + "center2", args[1] + "/3");
        Job job4 = run(INTERMEDIATE_PATH + "not_center2", args[1] + "/4");
        Job job5 = run(INTERMEDIATE_PATH + "center4", args[1] + "/5");
        Job job6 = run(INTERMEDIATE_PATH + "not_center4", args[1] + "/6");

        while (!(job1.isComplete() && job2.isComplete() && job3.isComplete()
                && job4.isComplete() && job5.isComplete() && job6.isComplete())) {
            Thread.sleep(2000);
        }
    }

    FileSystem.get(firstJob.getConfiguration()).delete(new Path(INTERMEDIATE_PATH), true);
}
From source file:hw1.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    /*
     * The line below can set the number of reduce tasks to a specific
     * value, which equals the number of output 'part-r-...' files.
     */
    // job.setNumReduceTasks(5);
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(CustomPartitioner.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:info.halo9pan.word2vec.hadoop.mr.WordSort.java
License:Apache License
public int run(String[] args) throws Exception {
    logger.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    SortInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("WordSort");
    job.setJarByClass(WordSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, SortInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + SortInputFormat.PARTITION_FILENAME);
        try {
            SortInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            logger.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    SortOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    logger.info("done");
    return ret;
}
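This example switches between a simple hash-style partitioner and TotalOrderPartitioner, whose split points come from a partition file computed up front (here by SortInputFormat.writePartitionFile) and shipped to tasks via the distributed cache. For jobs that do not carry their own sampler, Hadoop's stock InputSampler can produce that partition file instead; below is a minimal sketch, assuming Text keys, with illustrative paths and sampling parameters:

    import java.net.URI;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    public final class TotalOrderSetup {
        // Configure 'job' for totally ordered output by sampling its input;
        // the job's input paths must already be set before sampling.
        static void useTotalOrder(Job job, Path outputDir) throws Exception {
            // Sample roughly 1% of records, up to 10,000 samples from at most 100 splits.
            InputSampler.Sampler<Text, Text> sampler =
                    new InputSampler.RandomSampler<>(0.01, 10000, 100);

            Path partitionFile = new Path(outputDir, "_partitions");
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
            InputSampler.writePartitionFile(job, sampler);

            // Ship the partition file to every task via the distributed cache.
            job.addCacheFile(new URI(partitionFile + "#_partitions"));
            job.setPartitionerClass(TotalOrderPartitioner.class);
        }
    }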
From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java
License:Apache License
/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If run from ToolRunner, conf should already be set, but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these, as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create the Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
From source file:io.bfscan.clueweb12.LMRetrieval.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("queries").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("float").hasArg()
            .withDescription("smoothing").create(SMOOTHING));
    options.addOption(OptionBuilder.withArgName("int").hasArg()
            .withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(QUERIES_OPTION)
            || !cmdline.hasOption(SMOOTHING) || !cmdline.hasOption(TOPK)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String queries = cmdline.getOptionValue(QUERIES_OPTION);
    String smoothing = cmdline.getOptionValue(SMOOTHING);
    String topk = cmdline.getOptionValue(TOPK);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + LMRetrieval.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - queries: " + queries);
    LOG.info(" - smoothing: " + smoothing);
    LOG.info(" - topk: " + topk);
    LOG.info(" - preprocessing: " + preprocessing);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.set(QUERIES_OPTION, queries);
    conf.setFloat(SMOOTHING, Float.parseFloat(smoothing));
    conf.setInt(TOPK, Integer.parseInt(topk));
    conf.set(PREPROCESSING, preprocessing);

    conf.set("mapreduce.map.memory.mb", "10048");
    conf.set("mapreduce.map.java.opts", "-Xmx10048m");
    conf.set("mapreduce.reduce.memory.mb", "10048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx10048m");
    conf.set("mapred.task.timeout", "6000000"); // default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output))) {
        fs.delete(new Path(output), true);
    }

    Job job = new Job(conf, LMRetrieval.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(LMRetrieval.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file:io.druid.indexer.DetermineHashedPartitionsJob.java
License:Apache License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        long startTime = System.currentTimeMillis();
        final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);

        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (!config.getSegmentGranularIntervals().isPresent()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(config.getSegmentGranularIntervals().get().size());
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
            log.error("Job failed: %s", groupByJob.getJobID());
            return false;
        }

        /*
         * Load partitions and intervals determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (!config.getSegmentGranularIntervals().isPresent()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = config.jsonMapper.readValue(
                    Utils.openInputStream(groupByJob, intervalInfoPath),
                    new TypeReference<List<Interval>>() {
                    });
            config.setGranularitySpec(
                    new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(),
                            config.getGranularitySpec().getQueryGranularity(), intervals));
            log.info("Determined Intervals for Job [%s]" + config.getSegmentGranularIntervals());
        }

        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = config.jsonMapper.readValue(
                        Utils.openInputStream(groupByJob, partitionInfoPath),
                        new TypeReference<Long>() {
                        });

                log.info("Found approximately [%,d] rows in data.", numRows);

                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());

                log.info("Creating [%,d] shards", numberOfShards);

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                if (numberOfShards == 1) {
                    actualSpecs.add(new HadoopyShardSpec(new NoneShardSpec(), shardCount++));
                } else {
                    for (int i = 0; i < numberOfShards; ++i) {
                        actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards,
                                HadoopDruidIndexerConfig.jsonMapper), shardCount++));
                        log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                    }
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:io.druid.indexer.DeterminePartitionsJob.java
License:Apache License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}