List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
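setPartitionerClass must be called before the job is submitted; otherwise it throws IllegalStateException. A minimal, self-contained sketch of the call (the FirstCharPartitioner class and the job wiring in the comments are illustrative, not taken from any of the source files below):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative custom partitioner: routes records to reducers by the first
// character of the key, so keys sharing a first letter land on the same reducer.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String k = key.toString();
        if (k.isEmpty()) {
            return 0;
        }
        return (Character.toLowerCase(k.charAt(0)) & Integer.MAX_VALUE) % numPartitions;
    }
}

// Usage (before job submission):
//   Job job = Job.getInstance(new Configuration(), "example");
//   job.setPartitionerClass(FirstCharPartitioner.class);
//   job.setNumReduceTasks(4);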
From source file:uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java
License:Open Source License
protected Job createJob(String[] args) throws Exception {
    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();
    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number of clients):
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}
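The FIXME in the example above notes that mixing the old-API JobConf/InputSampler with a new-API Job is unlikely to work. A hedged sketch of how that sampling step might look using only the org.apache.hadoop.mapreduce.lib.partition classes, assuming the job's input paths, input format (producing Text keys and values), map output key class, and number of reducers have already been set, as in createJob() above:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

// Hypothetical helper (not part of the original source): configures total-order
// partitioning on a new-API Job and writes the partition file with the
// new-API sampler.
public class TotalOrderSetup {

    public static void configureTotalOrder(Job job, Path outputPath) throws Exception {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // The partition file must be set before sampling, as the sampler writes to it.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path(outputPath, "_partitions.lst"));
        // Sample up to 10,000 keys from the job's input to pick reducer boundaries,
        // mirroring the (1, 10000) settings used in the original code.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(1.0, 10000));
    }
}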
From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error("Minimum number of reducers must be less than the maximum number of reducers: "
                    + "minimum was {} maximum was {}", minReducers, maxReducers);
            throw new IOException("Minimum number of reducers must be less than the maximum number of reducers");
        }
    }

    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }

    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);

    // If neither min nor max is specified then there is nothing to do; if max is specified and min is not,
    // that has already been taken care of. If min is specified and the number of reducers is less than it,
    // set the appropriate number of subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }

    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file:validpartitionnacluster.ValidPartitionNacluster.java
public int run(String[] allArgs) throws Exception {
    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ValidPartitionNacluster.class); // necessary (corrected)
    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(PartitionMapper.class);
    job.setPartitionerClass(AgePartitioner.class);
    job.setReducerClass(ParitionReducer.class);

    // Number of reducer tasks.
    job.setNumReduceTasks(4);
    job.setJobName("Partition for Machine Count");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
    return 0;
}
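The AgePartitioner class referenced above is not included in this listing. A hypothetical sketch of what such a partitioner commonly looks like, assuming the map output key is Text, the value is an IntWritable age, and the four age bands below (all of which are illustrative, not taken from the original project):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical AgePartitioner: routes records into age bands so that, with
// four reducers, each reducer receives one band.
public class AgePartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        int age = value.get();
        int band;
        if (age < 20) {
            band = 0;
        } else if (age < 40) {
            band = 1;
        } else if (age < 60) {
            band = 2;
        } else {
            band = 3;
        }
        // Guard against jobs configured with fewer reducers than bands.
        return band % numPartitions;
    }
}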
From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job.
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temporary output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ", num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check that every node folder exists and write the format metadata file into it
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }
            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {
            // Generate a checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a checksum-of-checksums, similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {

                        public boolean accept(Path arg0) {
                            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                                return true;
                            }
                            return false;
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }
}