Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
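
For orientation before the examples, here is a minimal sketch of a custom Partitioner and how it is registered with setPartitionerClass. The class name, key/value types, and reducer count are illustrative assumptions, not taken from any of the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative partitioner: routes each key by its hash, masked to stay non-negative.
public class HashModPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // The result must fall in [0, numPartitions), one slot per reduce task.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner-sketch");
        job.setJarByClass(HashModPartitioner.class);
        // setPartitionerClass must be called before the job is submitted;
        // afterwards it throws IllegalStateException.
        job.setPartitionerClass(HashModPartitioner.class);
        job.setNumReduceTasks(4); // getPartition receives 4 as numPartitions
        // ... set mapper, reducer, input and output paths, then submit as in the examples below.
    }
}

This hash-and-mask rule is essentially what the default HashPartitioner already does; registering a custom Partitioner is only worthwhile when keys need to be routed by some other rule, which is what the examples below do.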

Usage

From source file:hk.newsRecommender.TFIDF2.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    //      part0----------------------------------------------------
    Job job0 = Job.getInstance(conf, "sfitSingleNews");
    Path output0Path = new Path(hdfsUrl + "/data/recommend/tfidf0");
    HadoopUtil.delete(conf, output0Path);
    job0.setJarByClass(TFIDF.class);
    job0.setMapperClass(Mapper_Part0.class);
    // job1.setCombinerClass(Combiner_Part1.class); // combiner?
    job0.setReducerClass(Reduce_Part0.class);
    job0.setMapOutputKeyClass(Text.class);
    job0.setMapOutputValueClass(Text.class);
    job0.setOutputKeyClass(Text.class);
    job0.setOutputValueClass(Text.class);
    // job1.setNumReduceTasks(p.length);
    FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/data2.txt"));
    FileOutputFormat.setOutputPath(job0, output0Path);
    job0.waitForCompletion(true);

    //      part1----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDF.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // custom partitioner decides which reducer receives each key
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf0"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    //      part2----------------------------------------
    Job job2 = Job.getInstance(conf, "computeTFIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDF.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    //      part3----------------------------------------
    Configuration conf3 = new Configuration();
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    Job job3 = Job.getInstance(conf3, "My_tdif_part3");
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class); // custom partitioner paired with the grouping comparator above
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);

}

From source file:hk.newsRecommender.TFIDFClassify.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    //      part1----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/class1/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDFClassify.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // custom partitioner decides which reducer receives each key
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/data3.txt"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2----------------------------------------
    Job job2 = Job.getInstance(conf, "computIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/class1/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDFClassify.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/class1/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    //      part3----------------------------------------
    Job job3 = Job.getInstance(conf, "sortByTFIDFDec");
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/class1/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class); // custom partitioner paired with the grouping comparator above
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/class1/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);

    //      part4---------------??-------------------------
    //      Job job4 = Job.getInstance(conf, "siftKeywords");
    //      Path outputPath4=new Path(hdfsUrl + "/data/recommend/class1/matrix1");
    //      HadoopUtil.delete(conf, outputPath4);
    //      job4.setJarByClass(TFIDF.class);
    //      job4.setMapperClass(Mapper_Part4.class);
    //      job4.setReducerClass(Reduce_Part4.class);
    //      job4.setMapOutputKeyClass(Text.class);
    //      job4.setMapOutputValueClass(Text.class);
    //      job4.setOutputKeyClass(Text.class);
    //      job4.setOutputValueClass(Text.class);
    //      job4.setPartitionerClass(CustomPartitioner.class);
    //      FileInputFormat.addInputPath(job4, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    //      FileOutputFormat.setOutputPath(job4, outputPath4);
    //      job4.waitForCompletion(true);

    //      part5----------------------------------------
    FileSystem fsopen = FileSystem.get(conf);
    FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    Scanner scan = new Scanner(in);
    List<String> keywordList = new ArrayList<String>();
    while (scan.hasNext()) {
        keywordList.add(scan.next());
    }
    // the keyword list must be set in the Configuration before the Job is created
    conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));
    Job job5 = Job.getInstance(conf, "generateMatrix");
    Path outputPath5 = new Path(hdfsUrl + "/data/recommend/class1/matrix2");
    HadoopUtil.delete(conf, outputPath5);
    job5.setJarByClass(TFIDF.class);
    job5.setMapperClass(Mapper_Part5.class);
    job5.setReducerClass(Reduce_Part5.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job5, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    FileOutputFormat.setOutputPath(job5, outputPath5);
    job5.waitForCompletion(true);

}

From source file:hr.fer.tel.rovkp.homework02.task02.Program.java

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job job = Job.getInstance();
    job.setJarByClass(Program.class);
    job.setJobName("Locations");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(LocationsMapper.class);
    job.setPartitionerClass(LocationsPartitioner.class);
    job.setReducerClass(LocationsReducer.class);
    job.setNumReduceTasks(6);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, NullWritable.class, Text.class);
    job.waitForCompletion(true);
}
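
The LocationsPartitioner used above is not included in this listing. A plausible sketch, assuming the mapper emits an integer bin index as its key (this assumption, and the class body, are hypothetical):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical reconstruction; the real LocationsPartitioner is not shown in the excerpt above.
public class LocationsPartitioner extends Partitioner<IntWritable, Text> {
    @Override
    public int getPartition(IntWritable key, Text value, int numPartitions) {
        // With job.setNumReduceTasks(6), each of the six bin keys gets its own reducer.
        return (key.get() & Integer.MAX_VALUE) % numPartitions;
    }
}

Whatever the real implementation looks like, it must return a value in [0, 6) to line up with job.setNumReduceTasks(6) above.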

From source file:hr.fer.tel.rovkp.homework02.task03.Program.java

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job firstJob = Job.getInstance();
    firstJob.setJarByClass(Program.class);
    firstJob.setJobName("Locations");

    FileInputFormat.addInputPath(firstJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(firstJob, new Path(INTERMEDIATE_PATH));

    firstJob.setMapperClass(LocationsMapper.class);
    firstJob.setPartitionerClass(LocationsPartitioner.class);
    firstJob.setReducerClass(LocationsReducer.class);
    firstJob.setNumReduceTasks(6);

    firstJob.setOutputKeyClass(IntWritable.class);
    firstJob.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(firstJob, "bins", TextOutputFormat.class, NullWritable.class, Text.class);

    int code = firstJob.waitForCompletion(true) ? 0 : 1;

    System.out.println("First job return code: " + code);

    if (code == 0) {

        Job job1 = run(INTERMEDIATE_PATH + "center1", args[1] + "/1");
        Job job2 = run(INTERMEDIATE_PATH + "not_center1", args[1] + "/2");
        Job job3 = run(INTERMEDIATE_PATH + "center2", args[1] + "/3");
        Job job4 = run(INTERMEDIATE_PATH + "not_center2", args[1] + "/4");
        Job job5 = run(INTERMEDIATE_PATH + "center4", args[1] + "/5");
        Job job6 = run(INTERMEDIATE_PATH + "not_center4", args[1] + "/6");

        while (!(job1.isComplete() && job2.isComplete() && job3.isComplete() && job4.isComplete()
                && job5.isComplete() && job6.isComplete())) {
            Thread.sleep(2000);
        }
    }
    FileSystem.get(firstJob.getConfiguration()).delete(new Path(INTERMEDIATE_PATH), true);
}

From source file:hw1.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count"); // the Job(Configuration, String) constructor is deprecated; Job.getInstance(conf, "word count") is preferred in newer Hadoop releases
    /*
     * The line below can set the amount of reduce tasks to a specific
     * number, which equals to the amount of output 'part-r-...' files
     */
    // job.setNumReduceTasks(5);
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(CustomPartitioner.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:info.halo9pan.word2vec.hadoop.mr.WordSort.java

License:Apache License

public int run(String[] args) throws Exception {
    logger.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    SortInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("WordSort");
    job.setJarByClass(WordSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, SortInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + SortInputFormat.PARTITION_FILENAME);
        try {
            SortInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            logger.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    SortOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    logger.info("done");
    return ret;
}

From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java

License:Apache License

/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If ran from ToolRunner, conf should already be set but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
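
initJob returns a configured Job but does not submit it. A minimal hypothetical driver, placed inside LembosMapReduceRunner itself, might look like the following; the no-argument constructor and the direct (non-ToolRunner) invocation are assumptions, not taken from the Lembos source:

public static void main(final String[] args) throws Exception {
    // Assumed entry point: build the job from the Node.js module configuration, then block until it finishes.
    final Job job = new LembosMapReduceRunner().initJob(args);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}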

From source file:io.bfscan.clueweb12.LMRetrieval.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("queries").create(QUERIES_OPTION));
    options.addOption(
            OptionBuilder.withArgName("float").hasArg().withDescription("smoothing").create(SMOOTHING));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(QUERIES_OPTION)
            || !cmdline.hasOption(SMOOTHING) || !cmdline.hasOption(TOPK) || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String queries = cmdline.getOptionValue(QUERIES_OPTION);
    String smoothing = cmdline.getOptionValue(SMOOTHING);
    String topk = cmdline.getOptionValue(TOPK);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + LMRetrieval.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - queries: " + queries);
    LOG.info(" - smoothing: " + smoothing);
    LOG.info(" - topk: " + topk);
    LOG.info(" - preprocessing: " + preprocessing);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.set(QUERIES_OPTION, queries);
    conf.setFloat(SMOOTHING, Float.parseFloat(smoothing));
    conf.setInt(TOPK, Integer.parseInt(topk));
    conf.set(PREPROCESSING, preprocessing);

    conf.set("mapreduce.map.memory.mb", "10048");
    conf.set("mapreduce.map.java.opts", "-Xmx10048m");
    conf.set("mapreduce.reduce.memory.mb", "10048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx10048m");
    conf.set("mapred.task.timeout", "6000000"); // default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output))) {
        fs.delete(new Path(output), true);
    }

    Job job = new Job(conf, LMRetrieval.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(LMRetrieval.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:io.druid.indexer.DetermineHashedPartitionsJob.java

License:Apache License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        long startTime = System.currentTimeMillis();
        final Job groupByJob = Job.getInstance(new Configuration(), String
                .format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);
        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (!config.getSegmentGranularIntervals().isPresent()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(config.getSegmentGranularIntervals().get().size());
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
            log.error("Job failed: %s", groupByJob.getJobID());
            return false;
        }

        /*
         * Load partitions and intervals determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (!config.getSegmentGranularIntervals().isPresent()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = config.jsonMapper.readValue(
                    Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
                    });
            config.setGranularitySpec(
                    new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(),
                            config.getGranularitySpec().getQueryGranularity(), intervals));
            log.info("Determined Intervals for Job [%s]" + config.getSegmentGranularIntervals());
        }
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = config.jsonMapper.readValue(
                        Utils.openInputStream(groupByJob, partitionInfoPath), new TypeReference<Long>() {
                        });

                log.info("Found approximately [%,d] rows in data.", numRows);

                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());

                log.info("Creating [%,d] shards", numberOfShards);

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                if (numberOfShards == 1) {
                    actualSpecs.add(new HadoopyShardSpec(new NoneShardSpec(), shardCount++));
                } else {
                    for (int i = 0; i < numberOfShards; ++i) {
                        actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards,
                                HadoopDruidIndexerConfig.jsonMapper), shardCount++));
                        log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                    }
                }

                shardSpecs.put(bucket, actualSpecs);

            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}

From source file:io.druid.indexer.DeterminePartitionsJob.java

License:Apache License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}