Example usage for org.apache.hadoop.mapred JobConf setJarByClass

List of usage examples for org.apache.hadoop.mapred JobConf setJarByClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setJarByClass.

Prototype

public void setJarByClass(Class cls) 

Document

Set the job's jar file by finding an example class location.
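
A minimal sketch of how setJarByClass is typically used when configuring a JobConf, assuming a hypothetical driver class named MyJob and input/output paths taken from the command line (the identity mapper and reducer are used only to keep the sketch self-contained):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class MyJob {  // hypothetical driver class for illustration
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        conf.setJobName("setJarByClass example");

        // Tell Hadoop which jar to ship to the cluster by pointing at a class
        // contained in it; the jar is located from this class's classpath entry.
        conf.setJarByClass(MyJob.class);

        // Identity mapper/reducer simply pass records through.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}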

Usage

From source file:org.slc.sli.aggregation.mapreduce.map.ConfigurableMapReduceJob.java

License:Apache License

@SuppressWarnings("rawtypes")
protected static JobConf finalizeConfig(JobConf jobConf, ConfigSections s) throws IOException {

    Class<? extends Mapper> mapperClass = JobConfiguration.mapper.getMapClass(mapper);
    Class<? extends Reducer> reducerClass = JobConfiguration.function.getReduceClass(reduceFunction);
    Map<String, String> idFields = s.getMapper().getMapIdFields();

    // validate we have enough to continue
    boolean valid = true;
    if (mapperClass == null) {
        log.severe("Invalid map/reduce configuration detected : no mapper class specified.");
        valid = false;
    }
    if (idFields == null) {
        idFields = new HashMap<String, String>();
        log.severe("Invalid map/reduce configuration detected : no map id fields specified.");
        valid = false;
    }
    if (mapCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no map collection specified.");
        valid = false;
    }
    if (mapQuery == null) {
        log.severe("Invalid map/reduce configuration detected : no map query specified.");
        valid = false;
    }
    if (mapFields == null) {
        log.severe("Invalid map/reduce configuration detected : no map input fields specified.");
        valid = false;
    }
    if (reducerClass == null) {
        log.severe("Invalid map/reduce configuration detected : no reducer class specified.");
        valid = false;
    }
    if (reduceCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce collection specified.");
        valid = false;
    }
    if (reduceField == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce field specified.");
        valid = false;
    }

    if (!valid) {
        throw new IllegalArgumentException("Invalid mapper specified. Check log for details.");
    }

    jobConf.set("mapred.output.dir", String.format("%s-%s-%d", s.getMapper().getMapper(),
            s.getMetadata().getFunction(), System.currentTimeMillis()));

    jobConf.setJobName(s.getMetadata().getDescription() == null ? "M/R Job" : s.getMetadata().getDescription());

    // enable speculative execution. Multiple mapper tasks are created for the same split.
    // First one to finish wins; the remaining tasks are terminated.
    jobConf.setSpeculativeExecution(true);
    jobConf.setUseNewMapper(true);
    jobConf.setUseNewReducer(true);

    /**
     * TODO -- decide if this is required.
    String id = conf.get("@ID@");
    String tenantId = conf.get("@TENANT_ID@");
    for (Map.Entry<String, Object> entry : query.entrySet()) {
    Object value = entry.getValue();
    if (value instanceof String) {
        String s = (String) value;
        if (s.indexOf("@ID@") >= 0 && id != null) {
            s = s.replace("@ID@", id);
            query.put(entry.getKey(), s);
        }
        if (s.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
            s = s.replace("@TENANT_ID@", tenantId);
            query.put(entry.getKey(), s);
        }
    }
    }
            
    if (updateField.indexOf("@ID@") >= 0 && id != null) {
    updateField = updateField.replace("@ID@", id);
    }
    if (updateField.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
    updateField = updateField.replace("@TENANT_ID@", tenantId);
    }
    */

    MongoConfigUtil.setQuery(jobConf, new BasicDBObject(mapQuery));

    Map<String, Object> fullFields = new HashMap<String, Object>();
    for (String f : idFields.values()) {
        fullFields.put(f, 1);
    }
    fullFields.putAll(mapFields);

    MongoConfigUtil.setFields(jobConf, new BasicDBObject(fullFields));
    MongoConfigUtil.setInputKey(jobConf, idFields.get("id"));
    MongoConfigUtil.setInputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + mapCollection);
    MongoConfigUtil.setMapperOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setMapperOutputValue(jobConf, BSONWritable.class);
    MongoConfigUtil.setOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setOutputValue(jobConf, BSONWritable.class);

    // TODO - this probably should be configurable
    MongoConfigUtil.setReadSplitsFromSecondary(jobConf, true);

    MongoConfigUtil.setSplitSize(jobConf, 32);

    jobConf.setClass("mapred.input.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.input.value.class", BSONWritable.class, Object.class);

    jobConf.setClass("mapred.output.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.output.value.class", BSONWritable.class, Object.class);

    jobConf.setClass("mapreduce.inputformat.class", MongoTenantAndIdInputFormat.class, MongoInputFormat.class);
    jobConf.setClass("mapreduce.outputformat.class", MongoAggFormatter.class, MongoOutputFormat.class);
    MongoConfigUtil.setInputFormat(jobConf, MongoTenantAndIdInputFormat.class);
    MongoConfigUtil.setOutputFormat(jobConf, MongoAggFormatter.class);

    /**
     * Configure how hadoop calculates splits.
     *
     * We enable input splits to avoid having the entire job executed on a single hadoop node.
     *
     * We enable shard chunk splitting to allow mongo to specify how to split the input.
     *
     * We disable read splits from shards because we want hadoop connecting to mongos, not
     * mongod directly. This avoids incorrect results in situations where data is in the process
     * of migration at the same time hadoop is trying to read it.
     *
     * TODO - determine if we also need to set the input split key pattern. This depends
     * on how well data is distributed by _id. Setting the key pattern gives finer grained
     * control over how splits are calculated.
     */
    MongoConfigUtil.setCreateInputSplits(jobConf, true);
    MongoConfigUtil.setShardChunkSplittingEnabled(jobConf, true);
    MongoConfigUtil.setReadSplitsFromShards(jobConf, false);

    MongoConfigUtil.setOutputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + reduceCollection);

    jobConf.setJarByClass(JobConfiguration.class);

    MongoConfigUtil.setMapper(jobConf, mapperClass);
    jobConf.setClass(JobContext.MAP_CLASS_ATTR, mapperClass, Mapper.class);

    MongoConfigUtil.setReducer(jobConf, reducerClass);
    jobConf.setClass(JobContext.REDUCE_CLASS_ATTR, reducerClass, Reducer.class);

    // Set this relatively high to keep the total map execution time low.
    // Formula:  1.75 * (# nodes * max tasks)
    // TODO : replace this hardcoded value with one calculated from configuration information.
    jobConf.setNumReduceTasks(52);

    // Add the configuration itself to the JobConf.
    JobConfiguration.toHadoopConfiguration(s, jobConf);

    return jobConf;
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraGenJob.java

License:Apache License

@SuppressWarnings({ "ProhibitedExceptionDeclared" })
@Override
public RunningJob runJob(String[] args) throws Exception {
    JobConf job = (JobConf) getConf();
    setNumberOfRows(job, Long.parseLong(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGenJob.class);
    job.setMapperClass(TeraGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraGenRangeInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true);
    return JobClient.runJob(job);
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraSortJob.java

License:Apache License

@SuppressWarnings("ProhibitedExceptionDeclared")
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraConstants.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraConstants.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSortJob.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true);

    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    job.setInt("mapred.submit.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    RunningJob runningJob = JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraValidateJob.java

License:Apache License

public int run(String[] args) throws Exception {
    JobConf job = (JobConf) getConf();
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidateJob.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split 
    job.setLong("mapred.min.split.size", Long.MAX_VALUE);
    job.setInputFormat(TeraInputFormat.class);
    JobClient.runJob(job);
    return 0;
}

From source file:org.weikey.terasort.TeraSort.java

License:Apache License

@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    SortConfig sortConfig = new SortConfig(job);
    // if (args.length >= 3) {
    // job.setNumReduceTasks(Integer.valueOf(args[2]));
    // if (args.length >= 4) {
    // sortConfig.setStartKey(Integer.valueOf(args[3]));
    // if (args.length >= 5) {
    // sortConfig.setFieldSeparator(args[4]);
    // }
    // }
    // }

    Integer numMapTasks = null;
    Integer numReduceTasks = null;

    List<String> otherArgs = new ArrayList<String>();
    boolean createLzopIndex = false;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                job.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
                sortConfig.setIgnoreCase(true);
            } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
                sortConfig.setUnique(true);
            } else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
                String[] parts = StringUtils.split(args[++i], ",");
                sortConfig.setStartKey(Integer.valueOf(parts[0]));
                if (parts.length > 1) {
                    sortConfig.setEndKey(Integer.valueOf(parts[1]));
                }
            } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
                sortConfig.setFieldSeparator(args[++i]);
            } else if ("--total-order".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
            } else if ("--lzop-index".equals(args[i])) {
                createLzopIndex = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }

    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:scray.cassandra.hadoop.example.LineCounter.java

License:Apache License

public int run(String[] args) throws Exception {

    JobConf job = new JobConf(LineCounter.class);
    job.setJobName("Counting number of rows with CassandraVNodes InputFormat");
    job.setJarByClass(LineCounter.class);

    job.setReducerClass(ReducerToHDFS.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_DIR));

    job.setMapperClass(RowMapper.class);

    HashSet<String> hosts = new HashSet<String>();
    hosts.add(CASS_HOST);
    CassandraVNodeConfigHelper.setClusterName(job, "Test Cluster");
    CassandraVNodeConfigHelper.setDatacenter(job, DATA_CENTER);
    CassandraVNodeConfigHelper.setKeyspace(job, KEYSPACE);
    CassandraVNodeConfigHelper.setColumnFamily(job, COLUMN_FAMILY);
    CassandraVNodeConfigHelper.setNodes(job, hosts);

    job.setInputFormat((Class<InputFormat<Long, Row>>) (Object) CassandraVNodeInputFormat.class);

    JobClient.runJob(job);

    return 0;
}

From source file:source.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();

    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:uk.bl.wa.hadoop.hosts.HostsReport.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), HostsReport.class);

    log.info("Adding logs...");
    String line;
    BufferedReader br = new BufferedReader(new FileReader(args[0]));
    while ((line = br.readLine()) != null) {
        log.info("Adding " + line);
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();

    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    conf.setJarByClass(HostsReport.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapperClass(HostsReportMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setCombinerClass(HostsReportReducer.class);
    conf.setReducerClass(HostsReportReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);

    JobClient.runJob(conf);
    return 0;
}

From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java

License:Apache License

/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes),
                        1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);

            conf.setInputFormat(inputFormatClass);

            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));

            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));

            AvroJob.setMapperClass(conf, mapperClass);

            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }

        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers + ", save keys: "
                + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();

        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }

}

From source file:voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java

License:Apache License

/**
 * Run the job
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);
    conf.setReduceSpeculativeExecution(false);

    conf.setJarByClass(getClass());
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode + ", numReducers = "
                + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);

    } catch (Exception e) {
        throw new VoldemortException(e);
    }

}