List of usage examples for org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
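setMapOutputKeyClass declares the key class emitted by the map phase when it differs from the job's final output key class, which would otherwise be used as the default. Before the project examples below, here is a minimal sketch of the typical pattern; the TokenMapper and CountReducer classes, the job name, and the input/output paths are purely illustrative and not taken from any of the projects listed on this page.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class MapOutputKeyClassExample {

    // Illustrative mapper: emits <Text, IntWritable>, which differs from the
    // job's final <Text, Text> output, so the intermediate types must be
    // declared explicitly on the JobConf.
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        public void map(LongWritable offset, Text line,
                OutputCollector<Text, IntWritable> out, Reporter reporter) throws IOException {
            for (String token : line.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    out.collect(new Text(token), new IntWritable(1));
                }
            }
        }
    }

    // Illustrative reducer: sums the counts and writes them back out as Text.
    public static class CountReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, Text> {
        public void reduce(Text token, Iterator<IntWritable> counts,
                OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
            int sum = 0;
            while (counts.hasNext()) {
                sum += counts.next().get();
            }
            out.collect(token, new Text(Integer.toString(sum)));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(MapOutputKeyClassExample.class);
        conf.setJobName("map-output-key-class-example");

        conf.setMapperClass(TokenMapper.class);
        conf.setReducerClass(CountReducer.class);

        // Intermediate (map output) types: required here because they differ
        // from the final output types set below.
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        // Final (job output) types, which the map output types would
        // otherwise default to.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}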
From source file: uk.bl.wa.hadoop.indexer.WARCIndexerRunner.java
License: Open Source License
/**
 * 
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }

    // Decide whether to apply annotations:
    index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
            ConfigValueFactory.fromAnyRef(applyAnnotations));

    // Store the properties:
    conf.set(CONFIG_PROPERTIES,
            index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));
    if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
        LOG.info("Using hash-based ID.");
    }
    if (index_conf.hasPath("warc.solr.zookeepers")) {
        LOG.info("Using Zookeepers.");
    } else {
        LOG.info("Using SolrServers.");
    }

    // Also set reduce speculative execution off, avoiding duplicate
    // submissions to Solr.
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count dependent on concurrent HTTP connections to Solr
    // server.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCIndexerMapper.class);
    conf.setReducerClass(WARCIndexerReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(numReducers);
}
From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXMerger.java
License: Open Source License
/**
 * 
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());

    // Input
    conf.setInputFormat(TextInputFormat.class);

    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);

    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);

    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
}
From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger.java
License: Open Source License
/**
 * 
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());

    // Input
    conf.setInputFormat(SequenceFileInputFormat.class);

    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);

    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
}
From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java
License: Open Source License
/**
 * 
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCRawStatsMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // OR TextOutputFormat?
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}
From source file: voldemort.store.readonly.mr.azkaban.VoldemortBatchIndexJob.java
License: Apache License
/**
 * Method to allow this process to be a instance call from another Job.
 * 
 * @storeName to dump the value
 * @inputFile to generate the VFILE
 */
public void execute(String voldemortClusterLocalFile, String storeName, String inputPath, String outputPath,
        int voldemortCheckDataPercent) throws IOException, URISyntaxException {
    JobConf conf = createJobConf(VoldemortBatchIndexMapper.class, VoldemortBatchIndexReducer.class);

    try {
        // get the voldemort cluster definition
        // We need to use cluster.xml here where it not yet localized by
        // TaskRunner
        _cluster = HadoopUtils.readCluster(voldemortClusterLocalFile, conf);
    } catch (Exception e) {
        logger.error("Failed to read Voldemort cluster details", e);
        throw new RuntimeException("", e);
    }

    // set the partitioner
    conf.setPartitionerClass(VoldemortBatchIndexPartitoner.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // Blow Away the O/p if force.overwirte is available
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (getProps().getBoolean("force.output.overwrite", false)) {
        FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
        fs.delete(FileOutputFormat.getOutputPath(conf), true);
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // get the store information
    conf.setStrings("voldemort.index.filename", storeName + ".index");
    conf.setStrings("voldemort.data.filename", storeName + ".data");
    conf.setInt("input.data.check.percent", voldemortCheckDataPercent);
    conf.setStrings("voldemort.store.name", storeName);

    // run(conf);
    JobClient.runJob(conf);
}
From source file: voldemort.store.readonly.mr.HadoopStoreBuilder.java
License: Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes),
                        1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }
}
From source file: voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java
License: Apache License
/**
 * Run the job
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);

    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setReduceSpeculativeExecution(false);
    conf.setJarByClass(getClass());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode + ", numReducers = "
                + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);
    } catch (Exception e) {
        throw new VoldemortException(e);
    }
}