Example usage for org.apache.hadoop.mapred.lib.InputSampler.writePartitionFile

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.lib.InputSampler.writePartitionFile, which runs a sampler over a job's configured input and writes a partition file of split points for use with TotalOrderPartitioner.

Prototype

public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler)
            throws IOException, ClassNotFoundException, InterruptedException 
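
Before the full drivers below, here is a minimal sketch of the typical call sequence; the class name, paths, and sampler parameters are illustrative, not taken from the examples. writePartitionFile runs the given sampler over the job's configured input and writes the resulting split points to the file registered with TotalOrderPartitioner:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class PartitionFileSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setNumReduceTasks(4);
        job.setInputFormat(KeyValueTextInputFormat.class); // yields Text keys
        job.setMapOutputKeyClass(Text.class); // key type written to the partition file
        job.setPartitionerClass(TotalOrderPartitioner.class);
        FileInputFormat.setInputPaths(job, "/in"); // hypothetical input path

        // The partition file location must be registered before sampling.
        TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/_partitions")); // hypothetical path

        // Sample 10% of the input keys, up to 10,000 samples in total.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000);

        // Writes numReduceTasks - 1 ordered split keys to the partition file.
        InputSampler.writePartitionFile(job, sampler);
    }
}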


Usage

From source file: com.alexholmes.hadooputils.sort.Sort.java

License: Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        // Default to ~90% of the cluster's reduce capacity, unless
        // test.sort.reduces_per_host overrides it per task tracker.
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
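        // Ship the partition file to every task via the distributed cache;
        // the "#_sortPartitioning" URI fragment symlinks it into each task's
        // working directory under the name the partitioner looks up.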
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
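
A hedged sketch of invoking the driver above with a RandomSampler; the class name and all argument values are illustrative, and it assumes Sort's type parameters accept Text keys and values:

import com.alexholmes.hadooputils.sort.Sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.InputSampler;

public class SortDriverExample {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Sample 10% of keys, up to 1,000 samples, from at most 10 splits.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 1000, 10);
        // Null map-task count and codecs let the driver use its defaults;
        // 4 reducers means the sampler writes 3 split points.
        Sort<Text, Text> sort = new Sort<Text, Text>();
        boolean success = sort.runJob(jobConf, null, 4, sampler, null, null,
                false, "/in/dir1,/in/dir2", "/out/dir"); // hypothetical paths
        System.exit(success ? 0 : 1);
    }
}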

From source file: uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java

License: Open Source License

protected Job createJob(String[] args) throws Exception {

    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();

    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number
        // of clients)
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}
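
As the FIXME above warns, wrapping the new-API Job's Configuration in a JobConf and handing it to the old mapred.lib.InputSampler is fragile. A sketch of the new-API equivalent, which samples the Job directly (the class and method names and the sampler parameters are assumptions, not part of the source):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class NewApiPartitioning {
    /** Configures total-order partitioning on a new-API Job. */
    static void configureTotalOrder(Job job, Path outputPath)
            throws IOException, ClassNotFoundException, InterruptedException {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path(outputPath, "_partitions.lst"));
        // The new-API sampler reads the Job's own InputFormat directly;
        // no JobConf bridge is needed.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(1, 10000));
    }
}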