List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
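A minimal sketch of the set/get round trip (the property name here is made up for illustration): setInt stores the value under the given key, and getInt parses it back, returning the supplied default when the key is absent.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // "my.example.threshold" is a hypothetical key, not a Hadoop property
        conf.setInt("my.example.threshold", 42);
        // getInt returns the stored value; 10 would be returned if the key were unset
        int threshold = conf.getInt("my.example.threshold", 10);
        System.out.println(threshold); // prints 42
    }
}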
From source file: voldemort.store.readonly.mr.azkaban.VoldemortBatchIndexJob.java
License: Apache License
/**
 * Method to allow this process to be invoked as an instance call from another job.
 *
 * @param storeName store to dump the value for
 * @param inputPath input used to generate the VFILE
 */
public void execute(String voldemortClusterLocalFile, String storeName, String inputPath, String outputPath,
        int voldemortCheckDataPercent) throws IOException, URISyntaxException {
    JobConf conf = createJobConf(VoldemortBatchIndexMapper.class, VoldemortBatchIndexReducer.class);

    try {
        // Get the Voldemort cluster definition. We need to use cluster.xml
        // here since it has not yet been localized by the TaskRunner.
        _cluster = HadoopUtils.readCluster(voldemortClusterLocalFile, conf);
    } catch (Exception e) {
        logger.error("Failed to read Voldemort cluster details", e);
        throw new RuntimeException("", e);
    }

    // Set the partitioner
    conf.setPartitionerClass(VoldemortBatchIndexPartitoner.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    // Blow away the output directory if force.output.overwrite is set
    if (getProps().getBoolean("force.output.overwrite", false)) {
        FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
        fs.delete(FileOutputFormat.getOutputPath(conf), true);
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // Set the store information
    conf.setStrings("voldemort.index.filename", storeName + ".index");
    conf.setStrings("voldemort.data.filename", storeName + ".data");
    conf.setInt("input.data.check.percent", voldemortCheckDataPercent);
    conf.setStrings("voldemort.store.name", storeName);

    // run(conf);
    JobClient.runJob(conf);
}
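The custom key set with setInt above ("input.data.check.percent") would typically be read back on the task side in configure(JobConf). A minimal sketch of that read; the class name, field name, and default value are assumptions, not Voldemort's actual code:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class CheckPercentReader extends MapReduceBase {
    private int checkDataPercent;

    @Override
    public void configure(JobConf job) {
        // Reads the value set by the driver; falls back to 0 if absent (assumed default)
        checkDataPercent = job.getInt("input.data.check.percent", 0);
    }
}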
From source file: voldemort.store.readonly.mr.HadoopStoreBuilder.java
License: Apache License
/**
 * Run the job.
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive a "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor()
                        / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the Avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed, log the collision counters
        Counters counters = job.getCounters();
        if (saveKeys) {
            logger.info("Number of collisions in the job - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
            logger.info("Maximum number of collisions for one entry - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
        }

        // Do a checksum-of-checksums, similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check that each node folder exists and write its format metadata file
        for (Node node : cluster.getNodes()) {
            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {
                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {
                    public boolean accept(Path arg0) {
                        return arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".");
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;
                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte[] fileCheckSum = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }
    } catch (Exception e) {
        logger.error("Error in store builder", e);
        throw new VoldemortException(e);
    }
}
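As a side note on the setInt calls above: setInt (inherited from Configuration) is a typed convenience over set, storing the decimal string form of the value, so these two lines should be equivalent:

conf.setInt("num.chunks", numChunks);
conf.set("num.chunks", Integer.toString(numChunks));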
From source file: voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java
License: Apache License
/**
 * Run the job.
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);

    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setReduceSpeculativeExecution(false);
    conf.setJarByClass(getClass());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // Delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode
                + ", numReducers = " + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);
    } catch (Exception e) {
        throw new VoldemortException(e);
    }
}
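Keys like "vector.node.id" above are application-defined rather than Hadoop framework properties, so a mistyped key on the reading side silently falls back to the default rather than failing. One defensive read pattern (a sketch; the sentinel value and error message are assumptions):

// Assumes -1 is never a valid node id
int nodeId = conf.getInt("vector.node.id", -1);
if (nodeId < 0) {
    throw new IllegalStateException("vector.node.id was not set on the JobConf");
}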
From source file: wikiduper.wikipedia.RepackWikipedia.java
License: Apache License
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // This is the default block size
    int blocksize = 1000000;

    JobConf conf = new JobConf(getConf(), RepackWikipedia.class);
    conf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));
    conf.set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);
    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if ("none".equals(compressionType)) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
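Unlike the application-defined keys in the earlier examples, "io.seqfile.compress.blocksize" is a standard Hadoop property; its shipped default is also 1000000, so the setInt call here pins the job to that value even if the cluster's site configuration overrides the default. A sketch of checking the effective value:

// Log the block size the job will actually use; 1000000 is assumed to be
// the shipped default for this property.
LOG.info("SequenceFile compression block size: "
        + conf.getInt("io.seqfile.compress.blocksize", 1000000));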