Example usage for org.apache.hadoop.mapred JobConf setReduceSpeculativeExecution

List of usage examples for org.apache.hadoop.mapred JobConf setReduceSpeculativeExecution

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf setReduceSpeculativeExecution.

Prototype

public void setReduceSpeculativeExecution(boolean speculativeExecution) 

Document

Turn speculative execution on or off for this job for reduce tasks.
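
A minimal, self-contained sketch of the call in context (not drawn from the usage examples below); the job name and the input/output paths taken from the command line are hypothetical placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class DisableReduceSpeculationExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(DisableReduceSpeculationExample.class);
        conf.setJobName("example-job"); // hypothetical job name

        // Reducers that write to an external system (a search index, a key-value
        // store) are usually not idempotent, so a duplicate speculative attempt
        // could emit the same output twice. Turn speculation off for reduce
        // tasks only; map-side speculation keeps its default setting.
        conf.setReduceSpeculativeExecution(false);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));  // hypothetical input path
        FileOutputFormat.setOutputPath(conf, new Path(args[1])); // hypothetical output path

        JobClient.runJob(conf);
    }
}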

Usage

From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        //            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        //            if (!noCommit) {
        //                solr.commit();
        //            }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer:" + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        //            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        //            if (!noCommit) {
        //                solr.commit();
        //            }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer: " + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:org.apache.nutch.indexer.solr.SolrIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(SolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:org.cloudata.examples.upload.partitionjob.UploadJob.java

License:Apache License

public void runJob(String inputPath, String tableName) throws IOException {
    JobConf jobConf = new JobConf(UploadJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // Set AbstractTabletInputFormat.OUTPUT_TABLE so the KeyRangePartitioner knows the target table
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    CloudataConf conf = new CloudataConf();
    CTable ctable = CTable.openTable(conf, tableName);
    TabletInfo[] tabletInfos = ctable.listTabletInfos();

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(UploadReducer.class);
    jobConf.setReduceSpeculativeExecution(false);
    jobConf.setMaxReduceAttempts(0);
    // Use one reduce task per tablet
    jobConf.setNumReduceTasks(tabletInfos.length);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.terrier.applications.HadoopIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJobName("terrierIndexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return;
    }

    boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));
    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }
    if (ranOK) {
        if (!docPartitioned) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }

        Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                docPartitioned ? numberOfReducers : 1, jf);
    }
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
}

From source file:org.terrier.structures.indexing.CompressingMetaIndexBuilder.java

License:Mozilla Public License

/**
 * reverseAsMapReduceJob
 * @param index
 * @param structureName
 * @param keys
 * @param jf
 * @throws Exception
 */
//@SuppressWarnings("deprecation")
public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys,
        HadoopPlugin.JobFactory jf) throws Exception {
    long time = System.currentTimeMillis();
    final JobConf conf = jf.newJob();
    conf.setJobName("Reverse MetaIndex");
    conf.setMapOutputKeyClass(KeyValueTuple.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setMapperClass(MapperReducer.class);
    conf.setReducerClass(MapperReducer.class);
    conf.setNumReduceTasks(keys.length);
    conf.setPartitionerClass(KeyedPartitioner.class);
    conf.setInputFormat(CompressingMetaIndexInputFormat.class);
    conf.setReduceSpeculativeExecution(false);
    conf.set("MetaIndexInputStreamRecordReader.structureName", structureName);
    conf.setInt("CompressingMetaIndexBuilder.reverse.keyCount", keys.length);
    conf.set("CompressingMetaIndexBuilder.reverse.keys", ArrayUtils.join(keys, ","));
    conf.set("CompressingMetaIndexBuilder.forward.valueLengths",
            index.getIndexProperty("index." + structureName + ".value-lengths", ""));
    conf.set("CompressingMetaIndexBuilder.forward.keys",
            index.getIndexProperty("index." + structureName + ".key-names", ""));
    FileOutputFormat.setOutputPath(conf, new Path(index.getPath()));
    HadoopUtility.toHConfiguration(index, conf);

    conf.setOutputFormat(NullOutputFormat.class);
    try {
        RunningJob rj = JobClient.runJob(conf);
        rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        throw new Exception("Problem running job to reverse metadata", e);
    }
    //only update the index from the controlling process, so that we dont have locking/concurrency issues
    index.setIndexProperty("index." + structureName + ".reverse-key-names", ArrayUtils.join(keys, ","));
    index.flush();
    logger.info("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
}

From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java

License:Apache License

/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes),
                        1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);

            conf.setInputFormat(inputFormatClass);

            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));

            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));

            AvroJob.setMapperClass(conf, mapperClass);

            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }

        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers + ", save keys: "
                + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();

        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }

}

From source file:voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java

License:Apache License

/**
 * Run the job
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);
    conf.setReduceSpeculativeExecution(false);

    conf.setJarByClass(getClass());
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode + ", numReducers = "
                + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);

    } catch (Exception e) {
        throw new VoldemortException(e);
    }

}