List of usage examples for org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner#setPartitionFile
public static void setPartitionFile(Configuration conf, Path p)
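Before the per-project examples, here is a minimal sketch of the typical call sequence, assuming a Text-keyed input and hypothetical paths and names (none of this is taken from the examples below): register the partitioner, point it at a partition file, and then fill that file, in this sketch by sampling the input with InputSampler.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total-order-sort"); // hypothetical job name
        job.setInputFormatClass(KeyValueTextInputFormat.class); // Text keys to sample
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapOutputKeyClass(Text.class);
        job.setNumReduceTasks(4); // partition file will hold numReduceTasks - 1 split keys

        // Point the partitioner at the (not yet written) partition file.
        Path partitionFile = new Path("/tmp/_partitions.lst"); // hypothetical path
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Fill the partition file by sampling the input: 1% sampling frequency,
        // at most 1000 samples drawn from at most 10 splits.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10));
    }
}

The examples below vary mainly in how the partition file is produced: Kylin and Phoenix write region split points directly, while the sort jobs sample their input.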
From source file:org.apache.kylin.storage.hbase.steps.HFileOutputFormat3.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException {
    Configuration conf = job.getConfiguration();

    // Create the partitions file.
    FileSystem fs = FileSystem.get(conf);
    Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + RandomUtil.randomUUID());
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, splitPoints);
    fs.deleteOnExit(partitionsPath);

    // Configure the job to use it.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
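Both this example and the Phoenix one below delegate to a writePartitions helper that is not shown in this listing. TotalOrderPartitioner expects the partition file to be a SequenceFile of sorted keys with NullWritable values, so such a helper looks roughly like the following sketch (modeled on HBase's HFileOutputFormat2 and simplified; the real helper also validates that the first region's start key is empty before dropping it):

// Rough sketch only; uses org.apache.hadoop.io.SequenceFile / NullWritable
// and java.util.TreeSet in addition to the imports above.
private static void writePartitions(Configuration conf, Path partitionsPath,
        List<ImmutableBytesWritable> startKeys) throws IOException {
    TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys);
    // The first start key marks the implicit lowest range and must not
    // appear in the partition file itself.
    ImmutableBytesWritable first = sorted.first();
    sorted.remove(first);

    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(partitionsPath),
            SequenceFile.Writer.keyClass(ImmutableBytesWritable.class),
            SequenceFile.Writer.valueClass(NullWritable.class));
    try {
        for (ImmutableBytesWritable startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}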
From source file:org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    Configuration conf = job.getConfiguration();

    // Create the partitions file.
    Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
    FileSystem fs = partitionsPath.getFileSystem(conf);
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, tablesStartKeys);
    fs.deleteOnExit(partitionsPath);

    // Configure the job to use it.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
From source file:org.bgi.flexlab.gaea.util.SortUilts.java
License:Open Source License
public static void configureSampling(Path outPath, BioJob job, VCFSortOptions options) throws IOException {
    Configuration conf = job.getConfiguration();
    final Path partition = outPath.getFileSystem(conf)
            .makeQualified(new Path(outPath, "_partitioning" + "VCF"));
    TotalOrderPartitioner.setPartitionFile(conf, partition);
    try {
        // Ship the partition file to the tasks via the distributed cache;
        // the URI fragment is the symlink name the tasks will read.
        final URI partitionURI = new URI(partition.toString() + "#" + partition.getName());
        if (partitionURI.getScheme().equals("file"))
            return;
        ReferenceShare.distributeCache(partitionURI.toString(), job);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License:Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    // Check for an empty input dir before touching files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);
    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");

    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);

    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    // Either sample the input to build the partition file, or reuse one
    // supplied on the command line.
    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitions.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));
    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    // Write the VCF header to its own BGZF file, then concatenate it with the
    // sorted part files to produce the final bgzipped output.
    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();

    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}
From source file:uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java
License:Open Source License
protected Job createJob(String[] args) throws Exception {

    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();
    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);

    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number
        // of clients)
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}