List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Set the value of the name property to a boolean.
From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger.java
License:Open Source License
/** * /*from w w w.j av a 2s. c o m*/ * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ public void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); // Input conf.setInputFormat(SequenceFileInputFormat.class); // M-R conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MDXReduplicatingReducer.class); // Map outputs conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); // Job outputs conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); LOG.info("Used " + numReducers + " reducers."); conf.setNumReduceTasks(numReducers); // Compress the output from the maps, to cut down temp space // requirements between map and reduce. conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax // for 0.20.x ? conf.set("mapred.compress.map.output", "true"); // conf.set("mapred.map.output.compression.codec", // "org.apache.hadoop.io.compress.GzipCodec"); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.task.classpath.user.precedence", true); }
From source file:uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java
License:Open Source License
/** * /*from w w w . j a va2 s . c o m*/ * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(WARCRawStatsMapper.class); conf.setReducerClass(MDXReduplicatingReducer.class); conf.setOutputFormat(TextOutputFormat.class); // OR TextOutputFormat? // conf.set("map.output.key.field.separator", ""); // Compress the output from the maps, to cut down temp space // requirements between map and reduce. conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax // for 0.20.x ? conf.set("mapred.compress.map.output", "true"); // conf.set("mapred.map.output.compression.codec", // "org.apache.hadoop.io.compress.GzipCodec"); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.task.classpath.user.precedence", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); }
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/** * Run the job/* w ww . j a v a 2 s . c om*/ */ public void build() { try { JobConf conf = new JobConf(config); conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE); conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster)); conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef))); conf.setBoolean("save.keys", saveKeys); conf.setBoolean("reducer.per.bucket", reducerPerBucket); if (!isAvro) { conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class); conf.setMapperClass(mapperClass); conf.setMapOutputKeyClass(BytesWritable.class); conf.setMapOutputValueClass(BytesWritable.class); if (reducerPerBucket) { conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class); } else { conf.setReducerClass(HadoopStoreBuilderReducer.class); } } conf.setInputFormat(inputFormatClass); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); conf.setJarByClass(getClass()); conf.setReduceSpeculativeExecution(false); FileInputFormat.setInputPaths(conf, inputPath); conf.set("final.output.dir", outputDir.toString()); conf.set("checksum.type", CheckSum.toString(checkSumType)); FileOutputFormat.setOutputPath(conf, tempDir); FileSystem outputFs = outputDir.getFileSystem(conf); if (outputFs.exists(outputDir)) { throw new IOException("Final output directory already exists."); } // delete output dir if it already exists FileSystem tempFs = tempDir.getFileSystem(conf); tempFs.delete(tempDir, true); long size = sizeOfPath(tempFs, inputPath); logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor() + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes); // Derive "rough" number of chunks and reducers int numReducers; if (saveKeys) { if (this.numChunks == -1) { this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfPartitions() / 
storeDef.getReplicationFactor() / chunkSizeBytes), 1); } else { logger.info( "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly"); } if (reducerPerBucket) { numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor(); } else { numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks; } } else { if (this.numChunks == -1) { this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfPartitions() / chunkSizeBytes), 1); } else { logger.info( "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly"); } if (reducerPerBucket) { numReducers = cluster.getNumberOfPartitions(); } else { numReducers = cluster.getNumberOfPartitions() * numChunks; } } conf.setInt("num.chunks", numChunks); conf.setNumReduceTasks(numReducers); if (isAvro) { conf.setPartitionerClass(AvroStoreBuilderPartitioner.class); // conf.setMapperClass(mapperClass); conf.setMapOutputKeyClass(ByteBuffer.class); conf.setMapOutputValueClass(ByteBuffer.class); conf.setInputFormat(inputFormatClass); conf.setOutputFormat((Class<? 
extends OutputFormat>) AvroOutputFormat.class); conf.setOutputKeyClass(ByteBuffer.class); conf.setOutputValueClass(ByteBuffer.class); // AvroJob confs for the avro mapper AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema"))); AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES))); AvroJob.setMapperClass(conf, mapperClass); if (reducerPerBucket) { conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class); } else { conf.setReducerClass(AvroStoreBuilderReducer.class); } } logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket); logger.info("Building store..."); RunningJob job = JobClient.runJob(conf); // Once the job has completed log the counter Counters counters = job.getCounters(); if (saveKeys) { if (reducerPerBucket) { logger.info("Number of collisions in the job - " + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS)); logger.info("Maximum number of collisions for one entry - " + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS)); } else { logger.info("Number of collisions in the job - " + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS)); logger.info("Maximum number of collisions for one entry - " + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS)); } } // Do a CheckSumOfCheckSum - Similar to HDFS CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType); if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) { throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType); } // Check if all folder exists and with format file for (Node node : cluster.getNodes()) { ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata(); if (saveKeys) { metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode()); 
} else { metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode()); } Path nodePath = new Path(outputDir.toString(), "node-" + node.getId()); if (!outputFs.exists(nodePath)) { logger.info("No data generated for node " + node.getId() + ". Generating empty folder"); outputFs.mkdirs(nodePath); // Create empty folder outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION)); logger.info("Setting permission to 755 for " + nodePath); } if (checkSumType != CheckSumType.NONE) { FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() { public boolean accept(Path arg0) { if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) { return true; } return false; } }); if (storeFiles != null && storeFiles.length > 0) { Arrays.sort(storeFiles, new IndexFileLastComparator()); FSDataInputStream input = null; for (FileStatus file : storeFiles) { try { input = outputFs.open(file.getPath()); byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)]; input.read(fileCheckSum); logger.debug("Checksum for file " + file.toString() + " - " + new String(Hex.encodeHex(fileCheckSum))); checkSumGenerator.update(fileCheckSum); } catch (Exception e) { logger.error("Error while reading checksum file " + e.getMessage(), e); } finally { if (input != null) input.close(); } outputFs.delete(file.getPath(), false); } metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType)); String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum())); logger.info("Checksum for node " + node.getId() + " - " + checkSum); metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum); } } // Write metadata Path metadataPath = new Path(nodePath, ".metadata"); FSDataOutputStream metadataStream = outputFs.create(metadataPath); outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION)); logger.info("Setting permission to 755 for " + metadataPath); 
metadataStream.write(metadata.toJsonString().getBytes()); metadataStream.flush(); metadataStream.close(); } } catch (Exception e) { logger.error("Error in Store builder", e); throw new VoldemortException(e); } }