List of usage examples for org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner#setPartitionFile
public static void setPartitionFile(Configuration conf, Path p)
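Before the per-project examples, here is a minimal sketch of the typical call sequence, assuming a Text-keyed input and hypothetical paths and names (none of this is taken from the examples below): register the partitioner, point it at a partition file, and then fill that file, in this sketch by sampling the input with InputSampler.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total-order-sort"); // hypothetical job name
        job.setInputFormatClass(KeyValueTextInputFormat.class); // Text keys to sample
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapOutputKeyClass(Text.class);
        job.setNumReduceTasks(4); // partition file will hold numReduceTasks - 1 split keys

        // Point the partitioner at the (not yet written) partition file.
        Path partitionFile = new Path("/tmp/_partitions.lst"); // hypothetical path
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Fill the partition file by sampling the input: 1% sampling frequency,
        // at most 1000 samples drawn from at most 10 splits.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10));
    }
}

The examples below vary mainly in how the partition file is produced: Kylin and Phoenix write region split points directly, while the sort jobs sample their input.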
From source file:org.apache.kylin.storage.hbase.steps.HFileOutputFormat3.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException {
    Configuration conf = job.getConfiguration();

    // Create the partitions file.
    FileSystem fs = FileSystem.get(conf);
    Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + RandomUtil.randomUUID());
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, splitPoints);
    fs.deleteOnExit(partitionsPath);

    // Configure the job to use it.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
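Both this example and the Phoenix one below delegate to a writePartitions helper that is not shown in this listing. TotalOrderPartitioner expects the partition file to be a SequenceFile of sorted keys with NullWritable values, so such a helper looks roughly like the following sketch (modeled on HBase's HFileOutputFormat2 and simplified; the real helper also validates that the first region's start key is empty before dropping it):

// Rough sketch only; uses org.apache.hadoop.io.SequenceFile / NullWritable
// and java.util.TreeSet in addition to the imports above.
private static void writePartitions(Configuration conf, Path partitionsPath,
        List<ImmutableBytesWritable> startKeys) throws IOException {
    TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys);
    // The first start key marks the implicit lowest range and must not
    // appear in the partition file itself.
    ImmutableBytesWritable first = sorted.first();
    sorted.remove(first);

    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(partitionsPath),
            SequenceFile.Writer.keyClass(ImmutableBytesWritable.class),
            SequenceFile.Writer.valueClass(NullWritable.class));
    try {
        for (ImmutableBytesWritable startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}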
From source file:org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    Configuration conf = job.getConfiguration();

    // Create the partitions file.
    Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
    FileSystem fs = partitionsPath.getFileSystem(conf);
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, tablesStartKeys);
    fs.deleteOnExit(partitionsPath);

    // Configure the job to use it.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
From source file:org.bgi.flexlab.gaea.util.SortUilts.java
License:Open Source License
public static void configureSampling(Path outPath, BioJob job, VCFSortOptions options) throws IOException {
    Configuration conf = job.getConfiguration();
    final Path partition = outPath.getFileSystem(conf)
            .makeQualified(new Path(outPath, "_partitioning" + "VCF"));
    TotalOrderPartitioner.setPartitionFile(conf, partition);
    try {
        // Ship the partition file to the tasks via the distributed cache;
        // the URI fragment is the symlink name the tasks will read.
        final URI partitionURI = new URI(partition.toString() + "#" + partition.getName());
        if (partitionURI.getScheme().equals("file"))
            return;
        ReferenceShare.distributeCache(partitionURI.toString(), job);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License:Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    // Check for an empty input dir before touching files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);
    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");

    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);

    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    // Either sample the input to build the partition file, or reuse one
    // supplied on the command line.
    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitions.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));
    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    // Write the VCF header to its own BGZF file, then concatenate it with the
    // sorted part files to produce the final bgzipped output.
    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();

    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}
From source file:uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java
License:Open Source License
protected Job createJob(String[] args) throws Exception {

    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();
    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);

    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number
        // of clients)
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}