Usage examples for org.apache.hadoop.mapred.lib.InputSampler.writePartitionFile
public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler) throws IOException, ClassNotFoundException, InterruptedException
From source file: com.alexholmes.hadooputils.sort.Sort.java
License: Apache License
/** * The driver for the sort MapReduce job. * * @param jobConf sort configuration * @param numMapTasks number of map tasks * @param numReduceTasks number of reduce tasks * @param sampler sampler, if required * @param codecClass the compression codec for compressing final outputs * @param mapCodecClass the compression codec for compressing intermediary map outputs * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes * for the job output files * @param inputDirAsString input directory in CSV-form * @param outputDirAsString output directory * @return true if the job completed successfully * @throws IOException if something went wrong * @throws URISyntaxException if a URI wasn't correctly formed *//*from w ww .ja va 2 s. com*/ public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks, final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass, final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes, final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException { jobConf.setJarByClass(Sort.class); jobConf.setJobName("sorter"); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); if (numMapTasks != null) { jobConf.setNumMapTasks(numMapTasks); } if (numReduceTasks != null) { jobConf.setNumReduceTasks(numReduceTasks); } else { int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sortReduces = jobConf.get("test.sort.reduces_per_host"); if (sortReduces != null) { numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces); } // Set user-supplied (possibly default) job configs jobConf.setNumReduceTasks(numReduces); } jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(SortReduce.class); jobConf.setInputFormat(SortInputFormat.class); jobConf.setMapOutputKeyClass(Text.class); 
jobConf.setMapOutputValueClass(Text.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); if (mapCodecClass != null) { jobConf.setMapOutputCompressorClass(mapCodecClass); } if (codecClass != null) { jobConf.setBoolean("mapred.output.compress", true); jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class); } FileInputFormat.setInputPaths(jobConf, inputDirAsString); FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString)); if (sampler != null) { System.out.println("Sampling input to effect total-order sort..."); jobConf.setPartitionerClass(TotalOrderPartitioner.class); Path inputDir = FileInputFormat.getInputPaths(jobConf)[0]; FileSystem fileSystem = FileSystem.get(jobConf); if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) { inputDir = inputDir.getParent(); } inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf)); Path partitionFile = new Path(inputDir, "_sortPartitioning"); TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile); InputSampler.writePartitionFile(jobConf, sampler); URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning"); DistributedCache.addCacheFile(partitionUri, jobConf); DistributedCache.createSymlink(jobConf); } System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from " + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + jobConf.getNumReduceTasks() + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println("The job took " + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds."); if (jobResult.isSuccessful()) { if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) { new LzoIndexer(jobConf).index(new 
Path(outputDirAsString)); } return true; } return false; }
From source file: uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java
License: Open Source License
protected Job createJob(String[] args) throws Exception { Job job = new Job(); job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis()); Configuration conf = job.getConfiguration(); this.setup(args, conf); Path input = new Path(this.inputPath); FileInputFormat.addInputPath(job, input); Path outputPath = new Path(this.outputPath); FileOutputFormat.setOutputPath(job, outputPath); job.setInputFormatClass(ArchiveToCDXFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); conf.set("map.output.key.field.separator", ""); conf.set("cdx.format", this.cdxFormat); conf.set("cdx.hdfs", Boolean.toString(this.hdfs)); conf.set("cdx.metatag", this.metaTag); conf.set("mapred.map.tasks.speculative.execution", "false"); conf.set("mapred.reduce.tasks.speculative.execution", "false"); // General config: job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(this.numReducers); job.setJarByClass(ArchiveCDXGenerator.class); // POST directly to the tinycdxserver: if (this.cdxserver != null) { conf.set("tinycdxserver.endpoint", this.cdxserver); conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size); // Perform the update in the Map phase (difficult to control number // of clients) // job.setMapperClass(TinyCDXServerMapper.class); // job.setReducerClass(Reducer.class); // Perform the update in the reduce phase: job.setMapperClass(Mapper.class); job.setReducerClass(TinyCDXServerReducer.class); } else {//ww w . j a v a2s . 
c o m // Default to the pass-through mapper and reducer: job.setMapperClass(Mapper.class); job.setReducerClass(Reducer.class); // Set up the split: if (this.splitFile != null) { log.info("Setting splitFile to " + this.splitFile); AlphaPartitioner.setPartitionPath(conf, this.splitFile); job.setPartitionerClass(AlphaPartitioner.class); } else { job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(outputPath, "_partitions.lst")); // FIXME This probably won't work - need to update to recent API JobConf jc = new JobConf(conf); InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000)); } } FileSystem fs = input.getFileSystem(conf); FileStatus inputStatus = fs.getFileStatus(input); FileInputFormat.setMaxInputSplitSize(job, inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf)); return job; }