Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce Job setCombinerClass, drawn from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
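
Before the collected examples, here is a minimal sketch of the typical pattern. A combiner performs local, map-side aggregation before the shuffle, so it is usually set to the reducer class itself when the reduce logic is associative and commutative (as in word count); the framework may run the combiner zero, one, or several times per map output, so it must not change the final result, and its output key/value types must match the map output types. setCombinerClass throws IllegalStateException if called after the job has been submitted. TokenizerMapper and IntSumReducer are assumed here to be the standard word-count mapper and reducer, as in the first example under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(CombinerExample.class);

        // TokenizerMapper and IntSumReducer are the standard word-count
        // mapper and reducer (assumed to be available on the classpath).
        job.setMapperClass(TokenizerMapper.class);
        // Reusing the reducer as the combiner is safe here because summing
        // integer counts is associative and commutative; the combiner's
        // output types must match the map output types (Text, IntWritable).
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}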

Usage

From source file:InClass.ex2.HundredTimes.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(HundredTimes.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:info.halo9pan.word2vec.hadoop.Main.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("hadoop.tmp.dir", (new Path("temp")).toUri().toString());
    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = optionParser.getRemainingArgs();
    if (remainingArgs.length != 2 && remainingArgs.length != 4) {
        System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
        System.exit(2);
    }
    Path input = new Path(remainingArgs[0]);
    String inputName = input.getName();
    Path countOutput = new Path(input.getParent(), inputName + "_count");
    Job countJob = Job.getInstance(conf, "Word Count");
    countJob.setJarByClass(Main.class);
    countJob.setMapperClass(ReadWordsMapper.class);
    countJob.setCombinerClass(ReadWordsReducer.class);
    countJob.setReducerClass(ReadWordsReducer.class);
    countJob.setOutputKeyClass(Text.class);
    countJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(countJob, input);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(countOutput))
        fs.delete(countOutput, true);
    FileOutputFormat.setOutputPath(countJob, countOutput);
    System.exit(countJob.waitForCompletion(true) ? 0 : 1);
}

From source file:io.aos.mapreduce.grep.GrepTool.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);

    if (args.length == 4) {
        conf.set(RegexMapper.GROUP, args[3]);
    }

    try {

        Job greJob = Job.getInstance(conf);
        greJob.setJobName("GrepSearch");

        FileInputFormat.setInputPaths(greJob, args[0]);

        greJob.setMapperClass(RegexMapper.class);
        greJob.setCombinerClass(LongSumReducer.class);
        greJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(greJob, tempDir);
        greJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        greJob.setOutputKeyClass(Text.class);
        greJob.setOutputValueClass(LongWritable.class);

        greJob.waitForCompletion(true);

        Job sortJob = Job.getInstance(conf);
        sortJob.setJobName("GrepSort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        // Write a single file
        sortJob.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        // Sort by decreasing frequency
        sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);

    } catch (Exception e) {
        return 2;
    } finally {
        FileSystem.get(conf).delete(tempDir, true);
    }

    return 0;

}

From source file:io.aos.t4f.hadoop.mapreduce.WordCountMapReduceTest2.java

License:Apache License

public static int main(String... args) throws Exception {

    // Get the default configuration object
    Configuration conf = new Configuration();

    // Add resources
    conf.addResource("hdfs-default.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-default.xml");
    conf.addResource("mapred-site.xml");

    Job job = new Job(conf);
    job.setJobName("WordCount");

    List<String> other_args = parseArguments(args, job);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MapClass.class);
    job.setCombinerClass(ReduceClass.class);
    job.setReducerClass(ReduceClass.class);

    // Set the input format class
    job.setInputFormatClass(TextInputFormat.class);
    // Set the output format class
    job.setOutputFormatClass(TextOutputFormat.class);
    // Set the input path
    TextInputFormat.setInputPaths(job, other_args.get(0));
    // Set the output path
    TextOutputFormat.setOutputPath(job, new Path(other_args.get(1)));

    /*
     * Set the minimum and maximum split sizes. This parameter helps to
     * control the number of map tasks: for each input split, there will be
     * a separate map task. In this example each split is 32 MB.
     */
    TextInputFormat.setMinInputSplitSize(job, 32 * MEGABYTES);
    TextInputFormat.setMaxInputSplitSize(job, 32 * MEGABYTES);

    // Set the jar file to run
    job.setJarByClass(WordCountMapReduceTest2.class);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    } else {
        System.out.println("Job Failed!!!");
    }

    return exitCode;

}

From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java

License:Apache License

/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If ran from ToolRunner, conf should already be set but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}

From source file:io.bfscan.clueweb09.ComputeTermStatistics.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - preprocessing: " + preprocessing);

    getConf().set(PREPROCESSING, preprocessing);

    Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(ComputeTermStatistics.class);

    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:io.bfscan.clueweb12.ComputeTermStatistics.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - preprocessing: " + preprocessing);

    getConf().set(PREPROCESSING, preprocessing);

    Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(ComputeTermStatistics.class);

    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:io.bfscan.clueweb12.MergeTermStatistics.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("minimum df").create(DF_MIN_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + MergeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Job job = new Job(getConf(), MergeTermStatistics.class.getSimpleName() + ":" + input);
    job.setJarByClass(MergeTermStatistics.class);

    job.setNumReduceTasks(100);

    if (cmdline.hasOption(DF_MIN_OPTION)) {
        int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION));
        LOG.info(" - dfMin: " + dfMin);
        job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin);
    }

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:io.druid.indexer.DeterminePartitionsJob.java

License:Apache License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}

From source file:io.druid.indexer.IndexGeneratorJob.java

License:Apache License

public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // hack to get druid.processing.bitmap property passed down to hadoop job.
        // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property,
                        String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}