Usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
From source file:edu.umd.shrawanraina.PartitionGraph.java
License:Apache License
/** * Runs this tool./*from w w w . ja v a 2 s. co m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(RANGE, "use range partitioner")); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption( OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions") .create(NUM_PARTITIONS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES) || !cmdline.hasOption(NUM_PARTITIONS)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inPath = cmdline.getOptionValue(INPUT); String outPath = cmdline.getOptionValue(OUTPUT); int nodeCount = Integer.parseInt(cmdline.getOptionValue(NUM_NODES)); int numParts = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS)); boolean useRange = cmdline.hasOption(RANGE); LOG.info("Tool name: " + PartitionGraph.class.getSimpleName()); LOG.info(" - input dir: " + inPath); LOG.info(" - output dir: " + outPath); LOG.info(" - num partitions: " + numParts); LOG.info(" - node cnt: " + nodeCount); LOG.info(" - use range partitioner: " + useRange); Configuration conf = getConf(); conf.setInt("NodeCount", nodeCount); Job job = Job.getInstance(conf); 
job.setJobName(PartitionGraph.class.getSimpleName() + ":" + inPath); job.setJarByClass(PartitionGraph.class); job.setNumReduceTasks(numParts); FileInputFormat.setInputPaths(job, new Path(inPath)); FileOutputFormat.setOutputPath(job, new Path(outPath)); job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(PageRankNodeUpd.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(PageRankNodeUpd.class); if (useRange) { job.setPartitionerClass(RangePartitioner.class); } FileSystem.get(conf).delete(new Path(outPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.windmemory.BuildInvertedIndexCompressed.java
License:Apache License
/** * Runs this tool./*from w ww . j ava 2 s . com*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? 
Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool name: " + BuildInvertedIndexCompressed.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - num reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(BuildInvertedIndexCompressed.class.getSimpleName()); job.setJarByClass(BuildInvertedIndexCompressed.class); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(PairOfStrings.class); job.setMapOutputValueClass(IntWritable.class); // job.setOutputKeyClass(PairOfStrings.class); // job.setOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(ArrayListWritable.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setMapperClass(MyMapper.class); job.setPartitionerClass(MyPartitioner.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.windmemory.PMIPairs.java
License:Apache License
/** * Runs this tool./* w ww . j a v a 2s. co m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairs.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairs.class.getSimpleName()); job.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairs.class.getSimpleName()); job2.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", "temp"); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(PairOfStrings.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.windmemory.PMIPairsR.java
License:Apache License
/** * Runs this tool.//from w w w .j a v a 2 s . co m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairsR.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairsR.class.getSimpleName()); job.setJarByClass(PMIPairsR.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); // job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairsR.class.getSimpleName()); job2.setJarByClass(PMIPairsR.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", "temp"); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(PairOfStrings.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); // job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.windmemory.PMIStripes.java
License:Apache License
/** * Runs this tool.//from w w w . j a v a 2 s . co m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairs.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairs.class.getSimpleName()); job.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairs.class.getSimpleName()); job2.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", interDir.toString()); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(HMapStIW.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java
License:Open Source License
private boolean runSummary(Path bamPath) throws IOException, ClassNotFoundException, InterruptedException { final Configuration conf = getConf(); Utils.configureSampling(wrkDir, bamPath.getName(), conf); final Job job = new Job(conf); job.setJarByClass(Summarize.class); job.setMapperClass(Mapper.class); job.setReducerClass(SummarizeReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Range.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(RangeCount.class); job.setInputFormatClass(SummarizeInputFormat.class); job.setOutputFormatClass(SummarizeOutputFormat.class); FileInputFormat.setInputPaths(job, bamPath); FileOutputFormat.setOutputPath(job, wrkDir); job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("summarize :: Sampling..."); t.start();// w w w.ja va2 s. c om InputSampler.<LongWritable, Range>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, Range>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("summarize :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); for (String lvl : levels) { MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, false), SummarizeOutputFormat.class, NullWritable.class, Range.class); MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, true), SummarizeOutputFormat.class, NullWritable.class, Range.class); } job.submit(); System.out.println("summarize :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("summarize :: Job failed."); return false; } System.out.printf("summarize :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); return true; }
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java
License:Open Source License
/**
 * Configures a sorting job for one input file, writes its partition file by sampling, and
 * submits the job without waiting for it.
 *
 * <p>The input file's name is stored in the configuration under
 * {@code Utils.WORK_FILENAME_PROPERTY} and used to configure sampling, both before the job
 * is created from the configuration.
 *
 * @param conf         configuration to update and build the job from
 * @param inputFile    the file to sort
 * @param outputDir    the directory the job writes into
 * @param commandName  prefix used in the progress messages
 * @param samplingInfo text appended to the "Sampling" progress message
 * @return the submitted job
 * @throws IOException            propagated from sampling or job submission
 * @throws ClassNotFoundException propagated from sampling or job submission
 * @throws InterruptedException   propagated from sampling or job submission
 */
static Job sortOne(Configuration conf, Path inputFile, Path outputDir, String commandName,
        String samplingInfo) throws IOException, ClassNotFoundException, InterruptedException {
    final String workName = inputFile.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, workName);
    Utils.configureSampling(outputDir, workName, conf);

    final Job sortJob = new Job(conf);
    sortJob.setJarByClass(Summarize.class);

    // Processing classes.
    sortJob.setMapperClass(Mapper.class);
    sortJob.setReducerClass(SortReducer.class);
    sortJob.setPartitionerClass(TotalOrderPartitioner.class);

    // Key/value types.
    sortJob.setMapOutputKeyClass(LongWritable.class);
    sortJob.setOutputKeyClass(NullWritable.class);
    sortJob.setOutputValueClass(Text.class);

    // Input and output.
    sortJob.setInputFormatClass(SortInputFormat.class);
    sortJob.setOutputFormatClass(SortOutputFormat.class);
    FileInputFormat.setInputPaths(sortJob, inputFile);
    FileOutputFormat.setOutputPath(sortJob, outputDir);

    final Timer timer = new Timer();

    System.out.printf("%s :: Sampling%s...\n", commandName, samplingInfo);
    timer.start();

    final int sampleCount = Math.max(1 << 16, conf.getInt("mapred.reduce.tasks", 1));
    final InputSampler.SplitSampler<LongWritable, Text> sampler =
            new InputSampler.SplitSampler<LongWritable, Text>(sampleCount, 10);
    InputSampler.<LongWritable, Text>writePartitionFile(sortJob, sampler);

    System.out.printf("%s :: Sampling complete in %d.%03d s.\n", commandName, timer.stopS(), timer.fms());

    sortJob.submit();
    return sortJob;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java
License:Open Source License
@Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); if (args.isEmpty()) { System.err.println("fixmate :: WORKDIR not given."); return 3; }// w w w .j av a 2 s .c o m if (args.size() == 1) { System.err.println("fixmate :: INPATH not given."); return 3; } if (!cacheAndSetProperties(parser)) return 3; final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue( stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate"); if (stringency == null) return 3; Path wrkDir = new Path(args.get(0)); final List<String> strInputs = args.subList(1, args.size()); final List<Path> inputs = new ArrayList<Path>(strInputs.size()); for (final String in : strInputs) inputs.add(new Path(in)); final Configuration conf = getConf(); // Used by Utils.getMergeableWorkFile() to name the output files. final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName(); conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName); if (stringency != null) conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString()); final boolean globalSort = parser.getBoolean(sortOpt); if (globalSort) Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname); conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0])); final Timer t = new Timer(); try { // Required for path ".", for example. 
wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir); if (globalSort) Utils.configureSampling(wrkDir, intermediateOutName, conf); final Job job = new Job(conf); job.setJarByClass(FixMate.class); job.setMapperClass(FixMateMapper.class); job.setReducerClass(FixMateReducer.class); if (!parser.getBoolean(noCombinerOpt)) job.setCombinerClass(FixMateReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(SAMRecordWritable.class); job.setInputFormatClass(AnySAMInputFormat.class); job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class); for (final Path in : inputs) FileInputFormat.addInputPath(job, in); FileOutputFormat.setOutputPath(job, wrkDir); if (globalSort) { job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("fixmate :: Sampling..."); t.start(); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); } job.submit(); System.out.println("fixmate :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("fixmate :: Job failed."); return 4; } System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("fixmate :: Hadoop error: %s\n", e); return 4; } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } if (outPath != null) try { Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate"); } catch (IOException e) { System.err.printf("fixmate :: Output merging failed: %s\n", e); return 5; } return 0; }
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.Sort.java
License:Open Source License
@Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); if (args.isEmpty()) { System.err.println("sort :: WORKDIR not given."); return 3; }// w w w. j a va 2 s.c om if (args.size() == 1) { System.err.println("sort :: INPATH not given."); return 3; } if (!cacheAndSetProperties(parser)) return 3; final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue( stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "sort"); if (stringency == null) return 3; Path wrkDir = new Path(args.get(0)); final List<String> strInputs = args.subList(1, args.size()); final List<Path> inputs = new ArrayList<Path>(strInputs.size()); for (final String in : strInputs) inputs.add(new Path(in)); final Configuration conf = getConf(); Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate); conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0])); if (stringency != null) conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString()); // Used by Utils.getMergeableWorkFile() to name the output files. final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName(); conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName); final Timer t = new Timer(); try { // Required for path ".", for example. 
wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir); Utils.configureSampling(wrkDir, intermediateOutName, conf); final Job job = new Job(conf); job.setJarByClass(Sort.class); job.setMapperClass(Mapper.class); job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setInputFormatClass(SortInputFormat.class); job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class); for (final Path in : inputs) FileInputFormat.addInputPath(job, in); FileOutputFormat.setOutputPath(job, wrkDir); job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("sort :: Sampling..."); t.start(); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); job.submit(); System.out.println("sort :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("sort :: Job failed."); return 4; } System.out.printf("sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("sort :: Hadoop error: %s\n", e); return 4; } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } if (outPath != null) try { Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "sort"); } catch (IOException e) { System.err.printf("sort :: Output merging failed: %s\n", e); return 5; } return 0; }
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.VCFSort.java
License:Open Source License
@Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); if (args.isEmpty()) { System.err.println("vcf-sort :: WORKDIR not given."); return 3; }//from w w w. jav a 2 s.c o m if (args.size() == 1) { System.err.println("vcf-sort :: INPATH not given."); return 3; } if (!cacheAndSetProperties(parser)) return 3; Path wrkDir = new Path(args.get(0)); final Path inPath = new Path(args.get(1)); final Configuration conf = getConf(); VCFFormat vcfFormat = null; final String f = (String) parser.getOptionValue(formatOpt); if (f != null) { try { vcfFormat = VCFFormat.valueOf(f.toUpperCase(Locale.ENGLISH)); } catch (IllegalArgumentException e) { System.err.printf("%s :: invalid format '%s'\n", getCommandName(), f); return 3; } } if (vcfFormat == null) vcfFormat = outPath == null ? VCFFormat.BCF : VCFFormat.inferFromFilePath(outPath); conf.setBoolean(VCFInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt)); conf.setBoolean(KeyIgnoringVCFOutputFormat.WRITE_HEADER_PROPERTY, outPath == null); conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString()); // Used by Utils.getMergeableWorkFile() to name the output files. final String intermediateOutName = (outPath == null ? inPath : outPath).getName(); conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName); conf.set(SortOutputFormat.INPUT_PATH_PROP, inPath.toString()); final Timer t = new Timer(); try { // Required for path ".", for example. 
wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir); Utils.configureSampling(wrkDir, intermediateOutName, conf); final Job job = new Job(conf); job.setJarByClass(VCFSort.class); job.setMapperClass(Mapper.class); job.setReducerClass(VCFSortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(VariantContextWritable.class); job.setInputFormatClass(VCFInputFormat.class); job.setOutputFormatClass(SortOutputFormat.class); FileInputFormat.addInputPath(job, inPath); FileOutputFormat.setOutputPath(job, wrkDir); job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("vcf-sort :: Sampling..."); t.start(); InputSampler.<LongWritable, VariantContextWritable>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("vcf-sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); job.submit(); System.out.println("vcf-sort :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("vcf-sort :: Job failed."); return 4; } System.out.printf("vcf-sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("vcf-sort :: Hadoop error: %s\n", e); return 4; } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } if (outPath != null) try { System.out.println("vcf-sort :: Merging output..."); t.start(); final OutputStream outs = outPath.getFileSystem(conf).create(outPath); // First, place the VCF or BCF header. 
final WrapSeekable ins = WrapSeekable.openPath(conf, inPath); final VCFHeader header = VCFHeaderReader.readHeaderFrom(ins); ins.close(); final VariantContextWriter writer; switch (vcfFormat) { case VCF: writer = VariantContextWriterFactory.create(new FilterOutputStream(outs) { @Override public void close() throws IOException { this.out.flush(); } }, null, VariantContextWriterFactory.NO_OPTIONS); break; case BCF: writer = VariantContextWriterFactory .create(new FilterOutputStream(new BlockCompressedOutputStream(outs, null)) { @Override public void close() throws IOException { this.out.flush(); } }, null, EnumSet.of(Options.FORCE_BCF)); break; default: assert false; writer = null; break; } writer.writeHeader(header); writer.close(); // Then, the actual VCF or BCF contents. Utils.mergeInto(outs, wrkDir, "", "", conf, "vcf-sort"); // And if BCF, the BGZF terminator. if (vcfFormat == VCFFormat.BCF) outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); outs.close(); System.out.printf("vcf-sort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("vcf-sort :: Output merging failed: %s\n", e); return 5; } return 0; }