Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
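
As a minimal sketch of how this method is typically used (the class and key/value types below are illustrative, not taken from the examples that follow): subclass Partitioner, override getPartition to map each key to a reducer index in [0, numPartitions), and register the class on the job before submission.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route each word to a reducer by hash,
// mirroring what the default HashPartitioner does.
public class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask off the sign bit so the modulus is never negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

Registering it is a single call; as the prototype above notes, setPartitionerClass throws IllegalStateException if the job has already been submitted:

    job.setPartitionerClass(WordPartitioner.class);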

Usage

From source file:edu.umd.shrawanraina.PartitionGraph.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(RANGE, "use range partitioner"));

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions")
            .create(NUM_PARTITIONS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(NUM_PARTITIONS)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inPath = cmdline.getOptionValue(INPUT);
    String outPath = cmdline.getOptionValue(OUTPUT);
    int nodeCount = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int numParts = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS));
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: " + PartitionGraph.class.getSimpleName());
    LOG.info(" - input dir: " + inPath);
    LOG.info(" - output dir: " + outPath);
    LOG.info(" - num partitions: " + numParts);
    LOG.info(" - node cnt: " + nodeCount);
    LOG.info(" - use range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setInt("NodeCount", nodeCount);

    Job job = Job.getInstance(conf);
    job.setJobName(PartitionGraph.class.getSimpleName() + ":" + inPath);
    job.setJarByClass(PartitionGraph.class);

    job.setNumReduceTasks(numParts);

    FileInputFormat.setInputPaths(job, new Path(inPath));
    FileOutputFormat.setOutputPath(job, new Path(outPath));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNodeUpd.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNodeUpd.class);

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    job.waitForCompletion(true);

    return 0;
}
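
RangePartitioner above is a custom class from the same repository, applied only when the range option is given; otherwise the job falls back to the default HashPartitioner. As a hedged sketch of what such a partitioner could look like (an assumption, not the original implementation), it can divide the dense node-ID space [1, nodeCount] evenly across reducers, reading the count from the "NodeCount" value the driver stores in the Configuration:

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch only: assumes node IDs run 1..nodeCount with no gaps.
public class RangePartitioner extends Partitioner<IntWritable, Writable>
        implements Configurable {
    private Configuration conf;
    private int nodeCount;

    @Override
    public int getPartition(IntWritable key, Writable value, int numPartitions) {
        // Map contiguous ID ranges to contiguous partition numbers.
        return (int) (((long) (key.get() - 1) * numPartitions) / nodeCount);
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        nodeCount = conf.getInt("NodeCount", 1);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }
}

Because the class implements Configurable, the framework injects the job Configuration when instantiating the partitioner, which is how the driver's conf.setInt("NodeCount", nodeCount) reaches it.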

From source file:edu.umd.windmemory.BuildInvertedIndexCompressed.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + BuildInvertedIndexCompressed.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BuildInvertedIndexCompressed.class.getSimpleName());
    job.setJarByClass(BuildInvertedIndexCompressed.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(IntWritable.class);
    // job.setOutputKeyClass(PairOfStrings.class);
    // job.setOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
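
MyPartitioner is defined elsewhere in the same source file. For an inverted index whose map output key is a (term, docno) PairOfStrings, the usual reason for a custom partitioner is to partition on the term alone, so that all postings for a term arrive at the same reducer. A sketch under that assumption (not the original code; the import path for PairOfStrings depends on which version of the Cloud9/lintools datatypes library the course used):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
import edu.umd.cloud9.io.pair.PairOfStrings; // assumed library location

public class MyPartitioner extends Partitioner<PairOfStrings, IntWritable> {
    @Override
    public int getPartition(PairOfStrings key, IntWritable value, int numPartitions) {
        // Partition by the left element (the term) only, ignoring the docno,
        // so all (term, *) pairs co-locate on one reducer.
        return (key.getLeftElement().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}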

From source file:edu.umd.windmemory.PMIPairs.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);

    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
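
The two jobs here run strictly in sequence: the first waitForCompletion call blocks until the counting pass has finished writing temp/part-r-00000 (a single file, since the first job uses one reducer), so the cache file registered on job2 exists by the time job2 starts. One common hardening, shown as a sketch rather than as the original author's code, is to check the boolean each call returns and abort the pipeline on failure:

    if (!job.waitForCompletion(true)) {
        return 1; // first pass failed; don't run the second pass
    }
    if (!job2.waitForCompletion(true)) {
        return 1;
    }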

From source file:edu.umd.windmemory.PMIPairsR.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairsR.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairsR.class.getSimpleName());
    job.setJarByClass(PMIPairsR.class);
    // Delete the output directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    // job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairsR.class.getSimpleName());
    job2.setJarByClass(PMIPairsR.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);

    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    // job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:edu.umd.windmemory.PMIStripes.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIStripes.class.getSimpleName());
    job.setJarByClass(PMIStripes.class);
    // Delete the output directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIStripes.class.getSimpleName());
    job2.setJarByClass(PMIStripes.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", interDir.toString());
    // job2.getConfiguration().setInt("num", reduceTasks);

    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(HMapStIW.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java

License:Open Source License

private boolean runSummary(Path bamPath) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration conf = getConf();
    Utils.configureSampling(wrkDir, bamPath.getName(), conf);
    final Job job = new Job(conf);

    job.setJarByClass(Summarize.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(SummarizeReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Range.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(RangeCount.class);

    job.setInputFormatClass(SummarizeInputFormat.class);
    job.setOutputFormatClass(SummarizeOutputFormat.class);

    FileInputFormat.setInputPaths(job, bamPath);
    FileOutputFormat.setOutputPath(job, wrkDir);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    System.out.println("summarize :: Sampling...");
    t.start();

    InputSampler.<LongWritable, Range>writePartitionFile(job,
            new InputSampler.RandomSampler<LongWritable, Range>(0.01, 10000, Math.max(100, reduceTasks)));

    System.out.printf("summarize :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

    for (String lvl : levels) {
        MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, false), SummarizeOutputFormat.class,
                NullWritable.class, Range.class);
        MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, true), SummarizeOutputFormat.class,
                NullWritable.class, Range.class);
    }

    job.submit();

    System.out.println("summarize :: Waiting for job completion...");
    t.start();

    if (!job.waitForCompletion(verbose)) {
        System.err.println("summarize :: Job failed.");
        return false;
    }

    System.out.printf("summarize :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());
    return true;
}
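
Unlike the hash-style partitioners above, TotalOrderPartitioner assigns keys to reducers by comparing them against split points read from a partition file, which yields globally sorted output across all reducers; that is why this job samples its input with InputSampler.writePartitionFile before being submitted (here Utils.configureSampling prepares the partition-file location). For a standalone job the equivalent wiring looks roughly like this (the path and key/value types are illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

Job job = Job.getInstance(conf, "total-order example");
job.setPartitionerClass(TotalOrderPartitioner.class);
// Tell the partitioner where its split points live...
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("_partitions.lst"));
// ...then sample the input to produce them: 1% sampling frequency,
// at most 10000 samples, drawn from at most 100 splits.
InputSampler.writePartitionFile(job,
        new InputSampler.RandomSampler<LongWritable, Text>(0.01, 10000, 100));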

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java

License:Open Source License

static Job sortOne(Configuration conf, Path inputFile, Path outputDir, String commandName,
        String samplingInfo) throws IOException, ClassNotFoundException, InterruptedException {
    conf.set(Utils.WORK_FILENAME_PROPERTY, inputFile.getName());
    Utils.configureSampling(outputDir, inputFile.getName(), conf);
    final Job job = new Job(conf);

    job.setJarByClass(Summarize.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);

    FileInputFormat.setInputPaths(job, inputFile);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    final Timer t = new Timer();

    System.out.printf("%s :: Sampling%s...\n", commandName, samplingInfo);
    t.start();

    InputSampler.<LongWritable, Text>writePartitionFile(job, new InputSampler.SplitSampler<LongWritable, Text>(
            Math.max(1 << 16, conf.getInt("mapred.reduce.tasks", 1)), 10));

    System.out.printf("%s :: Sampling complete in %d.%03d s.\n", commandName, t.stopS(), t.fms());
    job.submit();
    return job;
}
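
Note the sampler choice here: a SplitSampler takes the first records from each of up to 10 splits, which is much cheaper than the RandomSampler used in the previous example but only produces good split points when input order is uncorrelated with key order.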

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.Sort.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "sort");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);
    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(Sort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(SortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(SortInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("sort :: Job failed.");
            return 4;
        }

        System.out.printf("sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "sort");
        } catch (IOException e) {
            System.err.printf("sort :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.VCFSort.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("vcf-sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("vcf-sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    Path wrkDir = new Path(args.get(0));
    final Path inPath = new Path(args.get(1));

    final Configuration conf = getConf();

    VCFFormat vcfFormat = null;

    final String f = (String) parser.getOptionValue(formatOpt);
    if (f != null) {
        try {
            vcfFormat = VCFFormat.valueOf(f.toUpperCase(Locale.ENGLISH));
        } catch (IllegalArgumentException e) {
            System.err.printf("%s :: invalid format '%s'\n", getCommandName(), f);
            return 3;
        }
    }
    if (vcfFormat == null)
        vcfFormat = outPath == null ? VCFFormat.BCF : VCFFormat.inferFromFilePath(outPath);

    conf.setBoolean(VCFInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    conf.setBoolean(KeyIgnoringVCFOutputFormat.WRITE_HEADER_PROPERTY, outPath == null);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inPath : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    conf.set(SortOutputFormat.INPUT_PATH_PROP, inPath.toString());

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(VCFSort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(VCFSortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(VariantContextWritable.class);

        job.setInputFormatClass(VCFInputFormat.class);
        job.setOutputFormatClass(SortOutputFormat.class);

        FileInputFormat.addInputPath(job, inPath);
        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("vcf-sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, VariantContextWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("vcf-sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("vcf-sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("vcf-sort :: Job failed.");
            return 4;
        }

        System.out.printf("vcf-sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("vcf-sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            System.out.println("vcf-sort :: Merging output...");
            t.start();

            final OutputStream outs = outPath.getFileSystem(conf).create(outPath);

            // First, place the VCF or BCF header.

            final WrapSeekable ins = WrapSeekable.openPath(conf, inPath);
            final VCFHeader header = VCFHeaderReader.readHeaderFrom(ins);
            ins.close();

            final VariantContextWriter writer;

            switch (vcfFormat) {
            case VCF:
                writer = VariantContextWriterFactory.create(new FilterOutputStream(outs) {
                    @Override
                    public void close() throws IOException {
                        this.out.flush();
                    }
                }, null, VariantContextWriterFactory.NO_OPTIONS);
                break;

            case BCF:
                writer = VariantContextWriterFactory
                        .create(new FilterOutputStream(new BlockCompressedOutputStream(outs, null)) {
                            @Override
                            public void close() throws IOException {
                                this.out.flush();
                            }
                        }, null, EnumSet.of(Options.FORCE_BCF));
                break;

            default:
                assert false;
                writer = null;
                break;
            }

            writer.writeHeader(header);
            writer.close();

            // Then, the actual VCF or BCF contents.
            Utils.mergeInto(outs, wrkDir, "", "", conf, "vcf-sort");

            // And if BCF, the BGZF terminator.
            if (vcfFormat == VCFFormat.BCF)
                outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

            outs.close();

            System.out.printf("vcf-sort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms());

        } catch (IOException e) {
            System.err.printf("vcf-sort :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}