List of usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
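This method registers the Partitioner that decides which reduce task receives each map output record; it throws IllegalStateException if the job has already been submitted. The default is HashPartitioner, and the choice only matters when the job runs more than one reduce task. As a minimal sketch of the kind of class passed to this method (the class below is hypothetical, not taken from any of the examples that follow):

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    // Hypothetical example class: route each word to a reducer by its hash.
    public class WordPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // Mask the sign bit so the index is non-negative, then take the
            // remainder to stay within [0, numPartitions).
            return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

A driver would register it with job.setPartitionerClass(WordPartitioner.class), as in the examples below.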
From source file:hk.newsRecommender.TFIDF2.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part0 ----------------------------------------------------
    Job job0 = Job.getInstance(conf, "sfitSingleNews");
    Path output0Path = new Path(hdfsUrl + "/data/recommend/tfidf0");
    HadoopUtil.delete(conf, output0Path);
    job0.setJarByClass(TFIDF.class);
    job0.setMapperClass(Mapper_Part0.class);
    // job1.setCombinerClass(Combiner_Part1.class); // combiner?
    job0.setReducerClass(Reduce_Part0.class);
    job0.setMapOutputKeyClass(Text.class);
    job0.setMapOutputValueClass(Text.class);
    job0.setOutputKeyClass(Text.class);
    job0.setOutputValueClass(Text.class);
    // job1.setNumReduceTasks(p.length);
    FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/data2.txt"));
    FileOutputFormat.setOutputPath(job0, output0Path);
    job0.waitForCompletion(true);

    // part1 ----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDF.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf0"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2 ----------------------------------------------------
    Job job2 = Job.getInstance(conf, "computeTFIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDF.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3 ----------------------------------------------------
    Configuration conf3 = new Configuration();
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    Job job3 = Job.getInstance(conf3, "My_tdif_part3");
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class);
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);
}
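The part3 job above pairs setPartitionerClass with setGroupingComparatorClass, the usual secondary-sort arrangement: the partitioner routes all records sharing a natural key to the same reducer, and the grouping comparator makes them arrive in a single reduce() call. The CustomKey, CustomPartitioner, and CustomGroupComparator sources are not shown on this page, so the sketch below (one class per file) is a hypothetical reconstruction of that pattern; the getTerm() accessor is an assumption.

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Partitioner;

    public class CustomPartitioner extends Partitioner<CustomKey, NullWritable> {
        @Override
        public int getPartition(CustomKey key, NullWritable value, int numPartitions) {
            // Partition on the natural key only, so all records for one term
            // reach the same reducer regardless of their secondary sort field.
            return (key.getTerm().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public class CustomGroupComparator extends WritableComparator {
        protected CustomGroupComparator() {
            super(CustomKey.class, true); // true: instantiate keys for comparison
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Group reduce input by the natural key, ignoring the sort field,
            // so one reduce() call sees all values for a term in sorted order.
            return ((CustomKey) a).getTerm().compareTo(((CustomKey) b).getTerm());
        }
    }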
From source file:hk.newsRecommender.TFIDFClassify.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part1 ----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/class1/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDFClassify.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/data3.txt"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2 ----------------------------------------------------
    Job job2 = Job.getInstance(conf, "computIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/class1/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDFClassify.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/class1/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3 ----------------------------------------------------
    Job job3 = Job.getInstance(conf, "sortByTFIDFDec");
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/class1/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class);
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/class1/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);

    // part4 (disabled) -----------------------------------------
    // Job job4 = Job.getInstance(conf, "siftKeywords");
    // Path outputPath4 = new Path(hdfsUrl + "/data/recommend/class1/matrix1");
    // HadoopUtil.delete(conf, outputPath4);
    // job4.setJarByClass(TFIDF.class);
    // job4.setMapperClass(Mapper_Part4.class);
    // job4.setReducerClass(Reduce_Part4.class);
    // job4.setMapOutputKeyClass(Text.class);
    // job4.setMapOutputValueClass(Text.class);
    // job4.setOutputKeyClass(Text.class);
    // job4.setOutputValueClass(Text.class);
    // job4.setPartitionerClass(CustomPartitioner.class);
    // FileInputFormat.addInputPath(job4, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    // FileOutputFormat.setOutputPath(job4, outputPath4);
    // job4.waitForCompletion(true);

    // part5 ----------------------------------------------------
    FileSystem fsopen = FileSystem.get(conf);
    FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    Scanner scan = new Scanner(in);
    List<String> keywordList = new ArrayList<String>();
    while (scan.hasNext()) {
        keywordList.add(scan.next());
    }
    // Must be set on the Configuration before the job is created.
    conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));

    Job job5 = Job.getInstance(conf, "generateMatrix");
    Path outputPath5 = new Path(hdfsUrl + "/data/recommend/class1/matrix2");
    HadoopUtil.delete(conf, outputPath5);
    job5.setJarByClass(TFIDF.class);
    job5.setMapperClass(Mapper_Part5.class);
    job5.setReducerClass(Reduce_Part5.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job5, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    FileOutputFormat.setOutputPath(job5, outputPath5);
    job5.waitForCompletion(true);
}
From source file:hr.fer.tel.rovkp.homework02.task02.Program.java
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job job = Job.getInstance();
    job.setJarByClass(Program.class);
    job.setJobName("Locations");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(LocationsMapper.class);
    job.setPartitionerClass(LocationsPartitioner.class);
    job.setReducerClass(LocationsReducer.class);
    job.setNumReduceTasks(6);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, NullWritable.class, Text.class);

    job.waitForCompletion(true);
}
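A detail worth noting in this example: setNumReduceTasks(6) must agree with the partition indices the partitioner produces, since each index selects one of the six reduce tasks. The LocationsPartitioner source is not shown on this page, so the following is only a plausible sketch under that assumption:

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    // Hypothetical reconstruction; the real LocationsPartitioner is not shown above.
    public class LocationsPartitioner extends Partitioner<IntWritable, Text> {
        @Override
        public int getPartition(IntWritable key, Text value, int numPartitions) {
            // Spread the integer location keys across the six reducers
            // configured by job.setNumReduceTasks(6).
            return (key.get() & Integer.MAX_VALUE) % numPartitions;
        }
    }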
From source file:hr.fer.tel.rovkp.homework02.task03.Program.java
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: <jar> <input path> <output path>");
        return;
    }

    Job firstJob = Job.getInstance();
    firstJob.setJarByClass(Program.class);
    firstJob.setJobName("Locations");

    FileInputFormat.addInputPath(firstJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(firstJob, new Path(INTERMEDIATE_PATH));

    firstJob.setMapperClass(LocationsMapper.class);
    firstJob.setPartitionerClass(LocationsPartitioner.class);
    firstJob.setReducerClass(LocationsReducer.class);
    firstJob.setNumReduceTasks(6);

    firstJob.setOutputKeyClass(IntWritable.class);
    firstJob.setOutputValueClass(Text.class);

    MultipleOutputs.addNamedOutput(firstJob, "bins", TextOutputFormat.class, NullWritable.class, Text.class);

    int code = firstJob.waitForCompletion(true) ? 0 : 1;
    System.out.println("First job return code: " + code);

    if (code == 0) {
        Job job1 = run(INTERMEDIATE_PATH + "center1", args[1] + "/1");
        Job job2 = run(INTERMEDIATE_PATH + "not_center1", args[1] + "/2");
        Job job3 = run(INTERMEDIATE_PATH + "center2", args[1] + "/3");
        Job job4 = run(INTERMEDIATE_PATH + "not_center2", args[1] + "/4");
        Job job5 = run(INTERMEDIATE_PATH + "center4", args[1] + "/5");
        Job job6 = run(INTERMEDIATE_PATH + "not_center4", args[1] + "/6");

        while (!(job1.isComplete() && job2.isComplete() && job3.isComplete()
                && job4.isComplete() && job5.isComplete() && job6.isComplete())) {
            Thread.sleep(2000);
        }
    }

    FileSystem.get(firstJob.getConfiguration()).delete(new Path(INTERMEDIATE_PATH), true);
}
From source file:hw1.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    /*
     * The line below can set the number of reduce tasks to a specific
     * value, which equals the number of output 'part-r-...' files.
     */
    // job.setNumReduceTasks(5);
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(CustomPartitioner.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:info.halo9pan.word2vec.hadoop.mr.WordSort.java
License:Apache License
public int run(String[] args) throws Exception {
    logger.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    SortInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("WordSort");
    job.setJarByClass(WordSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, SortInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + SortInputFormat.PARTITION_FILENAME);
        try {
            SortInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            logger.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    SortOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    logger.info("done");
    return ret;
}
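This example switches between a simple hash-style partitioner and TotalOrderPartitioner, whose split points come from a partition file computed up front (here by SortInputFormat.writePartitionFile) and shipped to tasks via the distributed cache. For jobs that do not carry their own sampler, Hadoop's stock InputSampler can produce that partition file instead; below is a minimal sketch, assuming Text keys, with illustrative paths and sampling parameters:

    import java.net.URI;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    public final class TotalOrderSetup {
        // Configure 'job' for totally ordered output by sampling its input;
        // the job's input paths must already be set before sampling.
        static void useTotalOrder(Job job, Path outputDir) throws Exception {
            // Sample roughly 1% of records, up to 10,000 samples from at most 100 splits.
            InputSampler.Sampler<Text, Text> sampler =
                    new InputSampler.RandomSampler<>(0.01, 10000, 100);

            Path partitionFile = new Path(outputDir, "_partitions");
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
            InputSampler.writePartitionFile(job, sampler);

            // Ship the partition file to every task via the distributed cache.
            job.addCacheFile(new URI(partitionFile + "#_partitions"));
            job.setPartitionerClass(TotalOrderPartitioner.class);
        }
    }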
From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java
License:Apache License
/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If run from ToolRunner, conf should already be set, but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these, as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create the Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
From source file:io.bfscan.clueweb12.LMRetrieval.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("queries").create(QUERIES_OPTION));
    options.addOption(OptionBuilder.withArgName("float").hasArg()
            .withDescription("smoothing").create(SMOOTHING));
    options.addOption(OptionBuilder.withArgName("int").hasArg()
            .withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(QUERIES_OPTION)
            || !cmdline.hasOption(SMOOTHING) || !cmdline.hasOption(TOPK)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String queries = cmdline.getOptionValue(QUERIES_OPTION);
    String smoothing = cmdline.getOptionValue(SMOOTHING);
    String topk = cmdline.getOptionValue(TOPK);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);

    LOG.info("Tool name: " + LMRetrieval.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - queries: " + queries);
    LOG.info(" - smoothing: " + smoothing);
    LOG.info(" - topk: " + topk);
    LOG.info(" - preprocessing: " + preprocessing);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.set(QUERIES_OPTION, queries);
    conf.setFloat(SMOOTHING, Float.parseFloat(smoothing));
    conf.setInt(TOPK, Integer.parseInt(topk));
    conf.set(PREPROCESSING, preprocessing);

    conf.set("mapreduce.map.memory.mb", "10048");
    conf.set("mapreduce.map.java.opts", "-Xmx10048m");
    conf.set("mapreduce.reduce.memory.mb", "10048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx10048m");
    conf.set("mapred.task.timeout", "6000000"); // default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output))) {
        fs.delete(new Path(output), true);
    }

    Job job = new Job(conf, LMRetrieval.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(LMRetrieval.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file:io.druid.indexer.DetermineHashedPartitionsJob.java
License:Apache License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        long startTime = System.currentTimeMillis();
        final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);

        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (!config.getSegmentGranularIntervals().isPresent()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(config.getSegmentGranularIntervals().get().size());
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
            log.error("Job failed: %s", groupByJob.getJobID());
            return false;
        }

        /*
         * Load partitions and intervals determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (!config.getSegmentGranularIntervals().isPresent()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = config.jsonMapper.readValue(
                    Utils.openInputStream(groupByJob, intervalInfoPath),
                    new TypeReference<List<Interval>>() {
                    });
            config.setGranularitySpec(
                    new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(),
                            config.getGranularitySpec().getQueryGranularity(), intervals));
            log.info("Determined Intervals for Job [%s]" + config.getSegmentGranularIntervals());
        }

        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = config.jsonMapper.readValue(
                        Utils.openInputStream(groupByJob, partitionInfoPath),
                        new TypeReference<Long>() {
                        });

                log.info("Found approximately [%,d] rows in data.", numRows);

                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());

                log.info("Creating [%,d] shards", numberOfShards);

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                if (numberOfShards == 1) {
                    actualSpecs.add(new HadoopyShardSpec(new NoneShardSpec(), shardCount++));
                } else {
                    for (int i = 0; i < numberOfShards; ++i) {
                        actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards,
                                HadoopDruidIndexerConfig.jsonMapper), shardCount++));
                        log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                    }
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:io.druid.indexer.DeterminePartitionsJob.java
License:Apache License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}