Example usage for org.apache.hadoop.mapred.lib.InputSampler.writePartitionFile

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.lib.InputSampler.writePartitionFile, which runs a sampler over a job's configured input and writes a partition file of split points for use with TotalOrderPartitioner.

Prototype

public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler)
            throws IOException, ClassNotFoundException, InterruptedException 
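
Before the full drivers below, here is a minimal sketch of the typical call sequence; the class name, paths, and sampler parameters are illustrative, not taken from the examples. writePartitionFile runs the given sampler over the job's configured input and writes the resulting split points to the file registered with TotalOrderPartitioner:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class PartitionFileSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setNumReduceTasks(4);
        job.setInputFormat(KeyValueTextInputFormat.class); // yields Text keys
        job.setMapOutputKeyClass(Text.class); // key type written to the partition file
        job.setPartitionerClass(TotalOrderPartitioner.class);
        FileInputFormat.setInputPaths(job, "/in"); // hypothetical input path

        // The partition file location must be registered before sampling.
        TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/_partitions")); // hypothetical path

        // Sample 10% of the input keys, up to 10,000 samples in total.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000);

        // Writes numReduceTasks - 1 ordered split keys to the partition file.
        InputSampler.writePartitionFile(job, sampler);
    }
}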


Usage

From source file: com.alexholmes.hadooputils.sort.Sort.java

License: Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        // Default to ~90% of the cluster's reduce capacity, unless
        // test.sort.reduces_per_host overrides it per task tracker.
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
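        // Ship the partition file to every task via the distributed cache;
        // the "#_sortPartitioning" URI fragment symlinks it into each task's
        // working directory under the name the partitioner looks up.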
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
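
A hedged sketch of invoking the driver above with a RandomSampler; the class name and all argument values are illustrative, and it assumes Sort's type parameters accept Text keys and values:

import com.alexholmes.hadooputils.sort.Sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.InputSampler;

public class SortDriverExample {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Sample 10% of keys, up to 1,000 samples, from at most 10 splits.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 1000, 10);
        // Null map-task count and codecs let the driver use its defaults;
        // 4 reducers means the sampler writes 3 split points.
        Sort<Text, Text> sort = new Sort<Text, Text>();
        boolean success = sort.runJob(jobConf, null, 4, sampler, null, null,
                false, "/in/dir1,/in/dir2", "/out/dir"); // hypothetical paths
        System.exit(success ? 0 : 1);
    }
}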

From source file: uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java

License: Open Source License

protected Job createJob(String[] args) throws Exception {

    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();

    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number
        // of clients)
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}
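
As the FIXME above warns, wrapping the new-API Job's Configuration in a JobConf and handing it to the old mapred.lib.InputSampler is fragile. A sketch of the new-API equivalent, which samples the Job directly (the class and method names and the sampler parameters are assumptions, not part of the source):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class NewApiPartitioning {
    /** Configures total-order partitioning on a new-API Job. */
    static void configureTotalOrder(Job job, Path outputPath)
            throws IOException, ClassNotFoundException, InterruptedException {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path(outputPath, "_partitions.lst"));
        // The new-API sampler reads the Job's own InputFormat directly;
        // no JobConf bridge is needed.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(1, 10000));
    }
}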