List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
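Before the project examples below, a minimal sketch of the pattern they all share: implement the old-API org.apache.hadoop.mapred.Partitioner interface, register it on the JobConf with setPartitionerClass, and set a matching number of reduce tasks. The class name and key/value types here are illustrative only, not taken from any of the projects listed below.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

public class KeyHashPartitioner implements Partitioner<Text, IntWritable> {

    public void configure(JobConf job) {
        // no per-job setup needed for this sketch
    }

    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // mask the sign bit so the result is always a valid partition index
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Driver side (MyDriver is a placeholder class name):
// JobConf conf = new JobConf(MyDriver.class);
// conf.setPartitionerClass(KeyHashPartitioner.class);
// conf.setNumReduceTasks(4); // getPartition() will then see numPartitions == 4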
From source file:org.terrier.structures.indexing.CompressingMetaIndexBuilder.java
License:Mozilla Public License
/**
 * reverseAsMapReduceJob
 * @param index
 * @param structureName
 * @param keys
 * @param jf
 * @throws Exception
 */
//@SuppressWarnings("deprecation")
public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys,
        HadoopPlugin.JobFactory jf) throws Exception {
    long time = System.currentTimeMillis();
    final JobConf conf = jf.newJob();
    conf.setJobName("Reverse MetaIndex");
    conf.setMapOutputKeyClass(KeyValueTuple.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setMapperClass(MapperReducer.class);
    conf.setReducerClass(MapperReducer.class);
    conf.setNumReduceTasks(keys.length);
    conf.setPartitionerClass(KeyedPartitioner.class);
    conf.setInputFormat(CompressingMetaIndexInputFormat.class);
    conf.setReduceSpeculativeExecution(false);
    conf.set("MetaIndexInputStreamRecordReader.structureName", structureName);
    conf.setInt("CompressingMetaIndexBuilder.reverse.keyCount", keys.length);
    conf.set("CompressingMetaIndexBuilder.reverse.keys", ArrayUtils.join(keys, ","));
    conf.set("CompressingMetaIndexBuilder.forward.valueLengths",
            index.getIndexProperty("index." + structureName + ".value-lengths", ""));
    conf.set("CompressingMetaIndexBuilder.forward.keys",
            index.getIndexProperty("index." + structureName + ".key-names", ""));
    FileOutputFormat.setOutputPath(conf, new Path(index.getPath()));
    HadoopUtility.toHConfiguration(index, conf);
    conf.setOutputFormat(NullOutputFormat.class);
    try {
        RunningJob rj = JobClient.runJob(conf);
        rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        throw new Exception("Problem running job to reverse metadata", e);
    }
    // only update the index from the controlling process, so that we don't have locking/concurrency issues
    index.setIndexProperty("index." + structureName + ".reverse-key-names", ArrayUtils.join(keys, ","));
    index.flush();
    logger.info("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
}
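The Terrier job above creates one reducer per metadata key (setNumReduceTasks(keys.length)) and relies on KeyedPartitioner to send every tuple for a given key to the same reducer. The following is a hedged sketch of that idea only; Terrier's actual KeyedPartitioner and KeyValueTuple may differ, and the getKeyName() accessor is an assumption.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// KeyValueTuple is Terrier's map-output key class (import omitted; package may vary)
public class KeyIndexPartitioner implements Partitioner<KeyValueTuple, IntWritable> {

    private final Map<String, Integer> keyIndex = new HashMap<String, Integer>();

    public void configure(JobConf job) {
        // same property the driver above sets before submitting the job
        String[] keys = job.get("CompressingMetaIndexBuilder.reverse.keys").split(",");
        for (int i = 0; i < keys.length; i++) {
            keyIndex.put(keys[i], i);
        }
    }

    public int getPartition(KeyValueTuple key, IntWritable value, int numPartitions) {
        // getKeyName() is an assumed accessor; all tuples for one metadata key land on one reducer
        return keyIndex.get(key.getKeyName()) % numPartitions;
    }
}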
From source file:org.weikey.terasort.TeraSort.java
License:Apache License
@SuppressWarnings("deprecation") public int run(String[] args) throws Exception { LOG.info("starting"); JobConf job = (JobConf) getConf(); SortConfig sortConfig = new SortConfig(job); // if (args.length >= 3) { // job.setNumReduceTasks(Integer.valueOf(args[2])); // if (args.length >= 4) { // sortConfig.setStartKey(Integer.valueOf(args[3])); // if (args.length >= 5) { // sortConfig.setFieldSeparator(args[4]); // }/* w ww . java 2 s . co m*/ // } // } Integer numMapTasks = null; Integer numReduceTasks = null; List<String> otherArgs = new ArrayList<String>(); boolean createLzopIndex = false; for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { job.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { job.setNumReduceTasks(Integer.parseInt(args[++i])); } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) { sortConfig.setIgnoreCase(true); } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) { sortConfig.setUnique(true); } else if ("-k".equals(args[i]) || "--key".equals(args[i])) { String[] parts = StringUtils.split(args[++i], ","); sortConfig.setStartKey(Integer.valueOf(parts[0])); if (parts.length > 1) { sortConfig.setEndKey(Integer.valueOf(parts[1])); } } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) { sortConfig.setFieldSeparator(args[++i]); } else if ("--total-order".equals(args[i])) { double pcnt = Double.parseDouble(args[++i]); int numSamples = Integer.parseInt(args[++i]); int maxSplits = Integer.parseInt(args[++i]); if (0 >= maxSplits) { maxSplits = Integer.MAX_VALUE; } } else if ("--lzop-index".equals(args[i])) { createLzopIndex = true; } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Make sure there are exactly 2 parameters left. if (otherArgs.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2."); return printUsage(); } Path inputDir = new Path(args[0]); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSort"); job.setJarByClass(TeraSort.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); TeraInputFormat.writePartitionFile(job, partitionFile); DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); job.setInt("dfs.replication", 1); TeraOutputFormat.setFinalSync(job, true); JobClient.runJob(job); LOG.info("done"); return 0; }
From source file:source.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
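Both TeraSort variants above delegate partitioning to TotalOrderPartitioner, which reads a file of sampled split points (written by TeraInputFormat.writePartitionFile and shipped to every task through the DistributedCache symlink) and routes each key to the range it falls into, so the concatenation of the sorted reducer outputs is globally sorted. The sketch below illustrates only the range lookup; SplitPointPartitioner and readSplitPoints() are hypothetical stand-ins for Hadoop's real implementation, which uses a comparator-aware search (and a trie for byte-oriented keys).

import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

public class SplitPointPartitioner implements Partitioner<Text, Text> {

    // numPartitions - 1 sampled keys, sorted ascending
    private Text[] splitPoints;

    public void configure(JobConf job) {
        splitPoints = readSplitPoints(job); // hypothetical helper: load the cached partition file
    }

    public int getPartition(Text key, Text value, int numPartitions) {
        int pos = Arrays.binarySearch(splitPoints, key);
        // an exact match of split point i goes to partition i + 1; otherwise binarySearch
        // returns -(insertionPoint) - 1, and the insertion point is the target partition
        return pos >= 0 ? pos + 1 : -pos - 1;
    }

    private Text[] readSplitPoints(JobConf job) {
        // assumption: would read the sampled keys from the partition file symlink
        throw new UnsupportedOperationException("sketch only");
    }
}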
From source file:ucsc.hadoop.mapreduce.apache.Sort.java
License:Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *         job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:voldemort.store.readonly.mr.azkaban.VoldemortBatchIndexJob.java
License:Apache License
/**
 * Method to allow this process to be an instance call from another Job.
 *
 * @storeName to dump the value
 * @inputFile to generate the VFILE
 */
public void execute(String voldemortClusterLocalFile, String storeName, String inputPath, String outputPath,
        int voldemortCheckDataPercent) throws IOException, URISyntaxException {
    JobConf conf = createJobConf(VoldemortBatchIndexMapper.class, VoldemortBatchIndexReducer.class);

    try {
        // get the voldemort cluster definition
        // We need to use cluster.xml here where it is not yet localized by TaskRunner
        _cluster = HadoopUtils.readCluster(voldemortClusterLocalFile, conf);
    } catch (Exception e) {
        logger.error("Failed to read Voldemort cluster details", e);
        throw new RuntimeException("", e);
    }

    // set the partitioner
    conf.setPartitionerClass(VoldemortBatchIndexPartitoner.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    // Blow away the output if force.output.overwrite is set
    if (getProps().getBoolean("force.output.overwrite", false)) {
        FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
        fs.delete(FileOutputFormat.getOutputPath(conf), true);
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // get the store information
    conf.setStrings("voldemort.index.filename", storeName + ".index");
    conf.setStrings("voldemort.data.filename", storeName + ".data");
    conf.setInt("input.data.check.percent", voldemortCheckDataPercent);
    conf.setStrings("voldemort.store.name", storeName);

    // run(conf);
    JobClient.runJob(conf);
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers + ", save keys: "
                + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed, log the counters
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check that each node folder exists and write the format metadata
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);

    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setReduceSpeculativeExecution(false);
    conf.setJarByClass(getClass());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode + ", numReducers = "
                + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);
    } catch (Exception e) {
        throw new VoldemortException(e);
    }
}