Usage examples for org.apache.hadoop.mapred.JobConf#setReduceSpeculativeExecution(boolean), collected from open-source projects.
public void setReduceSpeculativeExecution(boolean speculativeExecution)
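Before the project examples below, here is a minimal sketch of the basic pattern (the class name NoReduceSpeculationJob and the args-based input/output paths are placeholders, not taken from any project on this page; the job relies on the old mapred API's identity mapper and reducer defaults). Most of the examples disable reduce-side speculation because their reducers do side-effecting work, such as posting documents to Solr, and a speculative duplicate attempt would repeat that work.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class NoReduceSpeculationJob {

    public static void main(String[] args) throws Exception {
        // Hypothetical identity job: with no mapper/reducer set, the old
        // mapred API defaults (IdentityMapper/IdentityReducer) copy the
        // TextInputFormat records straight through.
        JobConf conf = new JobConf(NoReduceSpeculationJob.class);
        conf.setJobName("no-reduce-speculation");

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Turn off speculative execution for reduce tasks so the framework
        // never launches a duplicate attempt of a running reducer -- the
        // usual precaution when reducers have side effects outside HDFS.
        conf.setReduceSpeculativeExecution(false);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));  // placeholder input path
        FileOutputFormat.setOutputPath(conf, new Path(args[1])); // placeholder output path

        JobClient.runJob(conf);
    }
}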
From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }

    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        // SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        // if (!noCommit) {
        //     solr.commit();
        // }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer:" + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }

    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        // SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        // if (!noCommit) {
        //     solr.commit();
        // }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer: " + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }

    NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }

    NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
From source file:org.apache.nutch.indexer.solr.SolrIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(SolrConstants.PARAMS, solrParams);
    }

    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
From source file:org.cloudata.examples.upload.partitionjob.UploadJob.java
License:Apache License
public void runJob(String inputPath, String tableName) throws IOException {
    JobConf jobConf = new JobConf(UploadJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // KeyRangePartitioner reads the target table name from
    // AbstractTabletInputFormat.OUTPUT_TABLE
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    CloudataConf conf = new CloudataConf();
    CTable ctable = CTable.openTable(conf, tableName);
    TabletInfo[] tabletInfos = ctable.listTabletInfos();

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(UploadReducer.class);
    jobConf.setReduceSpeculativeExecution(false);
    jobConf.setMaxReduceAttempts(0);
    // one reduce task per tablet
    jobConf.setNumReduceTasks(tabletInfos.length);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.terrier.applications.HadoopIndexing.java
License:Mozilla Public License
/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJobName("terrierIndexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return;
    }

    boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }
    if (ranOK) {
        if (!docPartitioned) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }
        Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                docPartitioned ? numberOfReducers : 1, jf);
    }
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
}
From source file:org.terrier.structures.indexing.CompressingMetaIndexBuilder.java
License:Mozilla Public License
/**
 * reverseAsMapReduceJob
 * @param index
 * @param structureName
 * @param keys
 * @param jf
 * @throws Exception
 */
//@SuppressWarnings("deprecation")
public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys,
        HadoopPlugin.JobFactory jf) throws Exception {
    long time = System.currentTimeMillis();
    final JobConf conf = jf.newJob();
    conf.setJobName("Reverse MetaIndex");
    conf.setMapOutputKeyClass(KeyValueTuple.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setMapperClass(MapperReducer.class);
    conf.setReducerClass(MapperReducer.class);
    conf.setNumReduceTasks(keys.length);
    conf.setPartitionerClass(KeyedPartitioner.class);
    conf.setInputFormat(CompressingMetaIndexInputFormat.class);
    conf.setReduceSpeculativeExecution(false);
    conf.set("MetaIndexInputStreamRecordReader.structureName", structureName);
    conf.setInt("CompressingMetaIndexBuilder.reverse.keyCount", keys.length);
    conf.set("CompressingMetaIndexBuilder.reverse.keys", ArrayUtils.join(keys, ","));
    conf.set("CompressingMetaIndexBuilder.forward.valueLengths",
            index.getIndexProperty("index." + structureName + ".value-lengths", ""));
    conf.set("CompressingMetaIndexBuilder.forward.keys",
            index.getIndexProperty("index." + structureName + ".key-names", ""));
    FileOutputFormat.setOutputPath(conf, new Path(index.getPath()));
    HadoopUtility.toHConfiguration(index, conf);

    conf.setOutputFormat(NullOutputFormat.class);
    try {
        RunningJob rj = JobClient.runJob(conf);
        rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        throw new Exception("Problem running job to reverse metadata", e);
    }
    //only update the index from the controlling process, so that we dont have locking/concurrency issues
    index.setIndexProperty("index." + structureName + ".reverse-key-names", ArrayUtils.join(keys, ","));
    index.flush();
    logger.info("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);

            conf.setInputFormat(inputFormatClass);

            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));

            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));

            AvroJob.setMapperClass(conf, mapperClass);

            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();

        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }

}
From source file:voldemort.store.readwrite.mr.HadoopRWStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    JobConf conf = new JobConf(config);
    conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
    conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
    conf.set("stores.xml",
            new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
    conf.setInt("vector.node.id", this.vectorNodeId);
    conf.setLong("vector.node.version", this.vectorNodeVersion);
    conf.setLong("job.start.time.ms", System.currentTimeMillis());

    conf.setPartitionerClass(HadoopRWStoreBuilderPartitioner.class);

    conf.setInputFormat(inputFormatClass);
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);

    conf.setReducerClass(HadoopRWStoreBuilderReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setReduceSpeculativeExecution(false);
    conf.setJarByClass(getClass());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, tempPath);

    try {
        // delete the temp dir if it exists
        FileSystem tempFs = tempPath.getFileSystem(conf);
        tempFs.delete(tempPath, true);

        conf.setInt("num.chunks", reducersPerNode);
        int numReducers = cluster.getNumberOfNodes() * reducersPerNode;
        logger.info("Replication factor = " + storeDef.getReplicationFactor() + ", numNodes = "
                + cluster.getNumberOfNodes() + ", reducers per node = " + reducersPerNode
                + ", numReducers = " + numReducers);
        conf.setNumReduceTasks(numReducers);

        logger.info("Building RW store...");
        JobClient.runJob(conf);
    } catch (Exception e) {
        throw new VoldemortException(e);
    }
}