List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
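Job.setCombinerClass registers a Reducer subclass that performs local, per-map-task aggregation of map output before the shuffle; it throws IllegalStateException if the job has already been submitted. Below is a minimal sketch of typical usage, modeled on the standard Hadoop word-count example; the WordCount, TokenizerMapper, and IntSumReducer names are assumptions for illustration, not taken from the listings that follow. A class is only safe to use as a combiner when its input and output key/value types match the map output types and its aggregation can be applied any number of times without changing the final result (for example, summing partial counts).

    // Minimal usage sketch (assumed class names, following the canonical word-count example)
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);   // emits <Text, IntWritable> pairs
    job.setCombinerClass(IntSumReducer.class);   // pre-aggregates map output locally before the shuffle
    job.setReducerClass(IntSumReducer.class);    // performs the final aggregation
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);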
From source file:InClass.ex2.HundredTimes.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(HundredTimes.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    // The mapper and reducer emit Text keys, so the output key class must be set accordingly.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:info.halo9pan.word2vec.hadoop.Main.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("hadoop.tmp.dir", (new Path("temp")).toUri().toString());
    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = optionParser.getRemainingArgs();
    if (remainingArgs.length != 2 && remainingArgs.length != 4) {
        System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
        System.exit(2);
    }

    Path input = new Path(remainingArgs[0]);
    String inputName = input.getName();
    Path countOutput = new Path(input.getParent(), inputName + "_count");

    Job countJob = Job.getInstance(conf, "Word Count");
    countJob.setJarByClass(Main.class);
    countJob.setMapperClass(ReadWordsMapper.class);
    countJob.setCombinerClass(ReadWordsReducer.class);
    countJob.setReducerClass(ReadWordsReducer.class);
    countJob.setOutputKeyClass(Text.class);
    countJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(countJob, input);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(countOutput)) {
        fs.delete(countOutput, true);
    }
    FileOutputFormat.setOutputPath(countJob, countOutput);

    // Run the job once and exit with its status.
    System.exit(countJob.waitForCompletion(true) ? 0 : 1);
}
From source file:io.aos.mapreduce.grep.GrepTool.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);
    if (args.length == 4) {
        conf.set(RegexMapper.GROUP, args[3]);
    }

    try {
        Job grepJob = Job.getInstance(conf);
        grepJob.setJobName("GrepSearch");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        grepJob.waitForCompletion(true);

        Job sortJob = Job.getInstance(conf);
        sortJob.setJobName("GrepSort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);
        sortJob.setMapperClass(InverseMapper.class);

        // Write a single file
        sortJob.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));

        // Sort by decreasing frequency
        sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);
    } catch (Exception e) {
        return 2;
    } finally {
        FileSystem.get(conf).delete(tempDir, true);
    }
    return 0;
}
From source file:io.aos.t4f.hadoop.mapreduce.WordCountMapReduceTest2.java
License:Apache License
public static int main(String... args) throws Exception {
    // Get the default configuration object
    Configuration conf = new Configuration();

    // Add resources
    conf.addResource("hdfs-default.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-default.xml");
    conf.addResource("mapred-site.xml");

    Job job = new Job(conf);
    job.setJobName("WordCount");

    List<String> other_args = parseArguments(args, job);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // The keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // The values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MapClass.class);
    job.setCombinerClass(ReduceClass.class);
    job.setReducerClass(ReduceClass.class);

    // Set the input format class
    job.setInputFormatClass(TextInputFormat.class);
    // Set the output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the input path
    TextInputFormat.setInputPaths(job, other_args.get(0));
    // Set the output path
    TextOutputFormat.setOutputPath(job, new Path(other_args.get(1)));

    /*
     * Set the minimum and maximum split sizes. This parameter helps to
     * control the number of map tasks: there is a separate map task for
     * each input split. In this example each split is 32 MB.
     */
    TextInputFormat.setMinInputSplitSize(job, 32 * MEGABYTES);
    TextInputFormat.setMaxInputSplitSize(job, 32 * MEGABYTES);

    // Set the jar file to run
    job.setJarByClass(WordCountMapReduceTest2.class);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    } else {
        System.out.println("Job Failed!!!");
    }

    return exitCode;
}
From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java
License:Apache License
/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If run from ToolRunner, conf should already be set; if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" entries to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
From source file:io.bfscan.clueweb09.ComputeTermStatistics.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - preprocessing: " + preprocessing);

    getConf().set(PREPROCESSING, preprocessing);

    Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(ComputeTermStatistics.class);
    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:io.bfscan.clueweb12.ComputeTermStatistics.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - preprocessing: " + preprocessing);

    getConf().set(PREPROCESSING, preprocessing);

    Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(ComputeTermStatistics.class);
    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:io.bfscan.clueweb12.MergeTermStatistics.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + MergeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Job job = new Job(getConf(), MergeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(MergeTermStatistics.class);
    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:io.druid.indexer.DeterminePartitionsJob.java
License:Apache License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:io.druid.indexer.IndexGeneratorJob.java
License:Apache License
public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // Hack to get the druid.processing.bitmap property passed down to the Hadoop job.
        // Once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // Prepend the property to allow overriding via hadoop.xxx properties set by
                // JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property,
                        String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}