List of usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
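Before the project-specific examples below, here is a minimal, self-contained sketch of the common pattern: run a job with waitForCompletion(), then read a counter from the Counters object that getCounters() returns. This is an illustrative sketch, not taken from any of the projects listed; the class name, job name, and input/output paths are placeholders, and it assumes the Hadoop 2.x (org.apache.hadoop.mapreduce) API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GetCountersExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "getCounters example"); // job name is a placeholder
        job.setJarByClass(GetCountersExample.class);

        // Identity, map-only job: default Mapper, TextInputFormat/TextOutputFormat.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output path

        if (job.waitForCompletion(true)) {
            // Counters are only meaningful once the job has run.
            Counters counters = job.getCounters();
            long mapInputRecords = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
            System.out.println("Map input records: " + mapInputRecords);
        }
    }
}

As in every example below, getCounters() is called only after the job has finished, since the aggregate counter values are gathered from the completed job.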
From source file: ivory.preprocess.BuildTermDocVectors2.java
License: Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); LOG.info("PowerTool: BuildTermDocVectors2"); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; }//from w w w .j a v a2 s. c o m DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName); job1.setJarByClass(BuildTermDocVectors2.class); job1.setNumReduceTasks(0); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // write out number of postings int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable2:" + collectionName); job2.setJarByClass(BuildTermDocVectors2.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file: ivory.preprocess.GetTermCount2.java
License: Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't exist: skipping!");
        return 0;
    }

    LOG.info("PowerTool: GetTermCount2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exists: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), "GetTermCount2:" + collectionName);
    job.setJarByClass(GetTermCount2.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out the term count and collection length. NOTE: the term count is not the
    // same as the raw number of terms in the collection, because postings for
    // non-English terms are discarded, or dropped as a result of the df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}
From source file: kogiri.common.report.Report.java
License: Open Source License
private String makeText(Job job) {
    String jobName = job.getJobName();
    String jobID = job.getJobID().toString();

    String jobStatus;
    try {
        jobStatus = job.getJobState().name();
    } catch (IOException ex) {
        jobStatus = "Unknown";
    } catch (InterruptedException ex) {
        jobStatus = "Unknown";
    }

    String startTimeStr;
    try {
        startTimeStr = TimeHelper.getTimeString(job.getStartTime());
    } catch (Exception ex) {
        startTimeStr = "Unknown";
    }

    String finishTimeStr;
    try {
        finishTimeStr = TimeHelper.getTimeString(job.getFinishTime());
    } catch (Exception ex) {
        finishTimeStr = "Unknown";
    }

    String timeTakenStr;
    try {
        timeTakenStr = TimeHelper.getDiffTimeString(job.getStartTime(), job.getFinishTime());
    } catch (Exception ex) {
        timeTakenStr = "Unknown";
    }

    String countersStr;
    try {
        countersStr = job.getCounters().toString();
    } catch (Exception ex) {
        countersStr = "Unknown";
    }

    return "Job : " + jobName + "\n" + "JobID : " + jobID + "\n" + "Status : " + jobStatus + "\n"
            + "StartTime : " + startTimeStr + "\n" + "FinishTime : " + finishTimeStr + "\n"
            + "TimeTaken : " + timeTakenStr + "\n\n" + countersStr;
}
From source file: kogiri.mapreduce.preprocess.indexing.stage3.KmerStatisticsBuilder.java
License: Open Source License
private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, ppConfig.getKmerIndexPath());
    for (Path inputFile : inputFiles) {
        LOG.info(inputFile);
    }

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        Path[] roundInputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, roundInputFile);

        Job job = new Job(conf,
                "Kogiri Preprocessor - Computing Kmer Statistics (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerStatisticsBuilder.class);

        // Mapper
        job.setMapperClass(KmerStatisticsBuilderMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Specify key / value
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Inputs
        Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                roundInputKmerIndexPartFiles);
        SequenceFileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(kmerIndexPartDataFiles));

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        // Outputs
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setNumReduceTasks(0);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // check results
        if (result) {
            CounterGroup uniqueGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameUnique());
            CounterGroup totalGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameTotal());
            CounterGroup squareGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameSquare());
            CounterGroup logTFSquareGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameLogTFSquare());

            Iterator<Counter> uniqueIterator = uniqueGroup.iterator();
            while (uniqueIterator.hasNext()) {
                long count = 0;
                long length = 0;
                long square = 0;
                double logTFSquare = 0;
                double real_mean = 0;
                double stddev = 0;
                double tf_cosnorm_base = 0;

                Counter uniqueCounter = uniqueIterator.next();
                Counter totalCounter = totalGroup.findCounter(uniqueCounter.getName());
                Counter squareCounter = squareGroup.findCounter(uniqueCounter.getName());
                Counter logTFSquareCounter = logTFSquareGroup.findCounter(uniqueCounter.getName());

                count = uniqueCounter.getValue();
                length = totalCounter.getValue();
                square = squareCounter.getValue();
                logTFSquare = logTFSquareCounter.getValue() / 1000.0;

                tf_cosnorm_base = Math.sqrt(logTFSquare);
                real_mean = (double) length / (double) count;

                // stddev = sqrt((sum(lengths ^ 2) / count) - (mean ^ 2))
                double mean = Math.pow(real_mean, 2);
                double term = (double) square / (double) count;
                stddev = Math.sqrt(term - mean);

                LOG.info("distinct k-mers " + uniqueCounter.getName() + " : " + count);
                LOG.info("total k-mers " + uniqueCounter.getName() + " : " + length);
                LOG.info("average " + uniqueCounter.getName() + " : " + real_mean);
                LOG.info("std-deviation " + uniqueCounter.getName() + " : " + stddev);
                LOG.info("tf-cos-norm-base " + uniqueCounter.getName() + " : " + tf_cosnorm_base);

                Path outputHadoopPath = new Path(ppConfig.getKmerStatisticsPath(),
                        KmerStatisticsHelper.makeKmerStatisticsFileName(uniqueCounter.getName()));
                FileSystem fs = outputHadoopPath.getFileSystem(conf);

                KmerStatistics statistics = new KmerStatistics();
                statistics.setSampleName(uniqueCounter.getName());
                statistics.setKmerSize(ppConfig.getKmerSize());
                statistics.setUniqueKmers(count);
                statistics.setTotalKmers(length);
                statistics.setAverageFrequency(real_mean);
                statistics.setStdDeviation(stddev);
                statistics.setTFCosineNormBase(tf_cosnorm_base);

                statistics.saveTo(fs, outputHadoopPath);
            }
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
From source file: libra.preprocess.stage2.KmerIndexBuilder.java
License: Apache License
private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Libra Preprocessor - Building Kmer Indexes (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        FastaKmerInputFormat.setKmerSize(conf, ppConfig.getKmerSize());
        job.setInputFormatClass(FastaKmerInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);
        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducers = conf.getInt("mapred.reduce.tasks", 0);
        if (reducers <= 0) {
            int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
            reducers = MRNodes * 2;
            job.setNumReduceTasks(reducers);
        }
        LOG.info("Reducers : " + reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());

            // create statistics of index
            createStatisticsOfIndex(new Path(ppConfig.getKmerStatisticsPath()), roundInputFile,
                    job.getConfiguration(), job.getCounters(), ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
From source file: ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker.java
License: Apache License
protected void updateBinningInfoWithMRJob() throws IOException, InterruptedException, ClassNotFoundException {
    RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();

    String filePath = Constants.BINNING_INFO_FILE_NAME;
    BufferedWriter writer = null;
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(pathFinder.getUpdatedBinningInfoPath(source), source);
        writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(filePath)), Charset.forName("UTF-8")));
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                writer.write(line + "\n");
            }
        }
    } finally {
        // release
        processor.closeScanners(scanners);
        IOUtils.closeQuietly(writer);
    }

    Configuration conf = new Configuration();
    prepareJobConf(source, conf, filePath);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Stats Updating Binning Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(UpdateBinningInfoMapper.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BinningInfoWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    job.setReducerClass(UpdateBinningInfoReducer.class);

    int mapperSize = new CombineInputFormat().getSplits(job).size();
    log.info("DEBUG: Test mapper size is {} ", mapperSize);
    Integer reducerSize = Environment.getInt(CommonConstants.SHIFU_UPDATEBINNING_REDUCER);
    if (reducerSize != null) {
        job.setNumReduceTasks(Environment.getInt(CommonConstants.SHIFU_UPDATEBINNING_REDUCER, 20));
    } else {
        // On average, each reducer handles 100 variables
        int newReducerSize = (this.columnConfigList.size() / 100) + 1;
        // if(newReducerSize < 1) {
        //     newReducerSize = 1;
        // }
        // if(newReducerSize > 500) {
        //     newReducerSize = 500;
        // }
        log.info("Adjust updating binning info reducer size to {} ", newReducerSize);
        job.setNumReduceTasks(newReducerSize);
    }
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String preTrainingInfo = this.pathFinder.getPreTrainingStatsPath(source);
    FileOutputFormat.setOutputPath(job, new Path(preTrainingInfo));

    // clean output firstly
    ShifuFileUtils.deleteFile(preTrainingInfo, source);

    // submit job
    if (!job.waitForCompletion(true)) {
        FileUtils.deleteQuietly(new File(filePath));
        throw new RuntimeException("MapReduce Job Updating Binning Info failed.");
    } else {
        long totalValidCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT")
                .getValue();
        long invalidTagCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")
                .getValue();
        long filterOut = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT")
                .getValue();
        long weightExceptions = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "WEIGHT_EXCEPTION")
                .getValue();
        log.info("Total valid records {}, invalid tag records {}, filter out records {}, weight exception records {}",
                totalValidCount, invalidTagCount, filterOut, weightExceptions);

        if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
            log.warn("Too many invalid tags, please check your configuration on positive tags and negative tags.");
        }
    }
    FileUtils.deleteQuietly(new File(filePath));
}
From source file: mvm.rya.accumulo.mr.fileinput.RdfFileInputByLineTool.java
License: Apache License
public long runJob(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException, AccumuloSecurityException {
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.set("io.sort.mb", "256");
    conf.setLong("mapred.task.timeout", 600000000);

    zk = conf.get(MRUtils.AC_ZK_PROP, zk);
    instance = conf.get(MRUtils.AC_INSTANCE_PROP, instance);
    userName = conf.get(MRUtils.AC_USERNAME_PROP, userName);
    pwd = conf.get(MRUtils.AC_PWD_PROP, pwd);
    format = RDFFormat.valueOf(conf.get(MRUtils.FORMAT_PROP, RDFFormat.NTRIPLES.toString()));

    String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, RdfCloudTripleStoreConstants.TBL_PRFX_DEF);

    Job job = new Job(conf);
    job.setJarByClass(RdfFileInputByLineTool.class);

    // set up cloudbase input
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));

    // set input output of the particular job
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);

    job.setOutputFormatClass(AccumuloOutputFormat.class);
    AccumuloOutputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd.getBytes()));
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);
    AccumuloOutputFormat.setZooKeeperInstance(job, instance, zk);

    // set mapper and reducer classes
    job.setMapperClass(TextToMutationMapper.class);
    job.setNumReduceTasks(0);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);

    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS")
                .getValue();
    } else {
        System.out.println("Job Failed!!!");
    }

    return -1;
}
From source file: mvm.rya.accumulo.mr.fileinput.RdfFileInputTool.java
License: Apache License
public long runJob(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException, AccumuloSecurityException {
    conf.set(MRUtils.JOB_NAME_PROP, "Rdf File Input");
    // faster
    init();

    format = conf.get(MRUtils.FORMAT_PROP, format);
    conf.set(MRUtils.FORMAT_PROP, format);
    String inputPath = conf.get(MRUtils.INPUT_PATH, args[0]);

    Job job = new Job(conf);
    job.setJarByClass(RdfFileInputTool.class);

    // set up cloudbase input
    job.setInputFormatClass(RdfFileInputFormat.class);
    RdfFileInputFormat.addInputPath(job, new Path(inputPath));

    // set input output of the particular job
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(RyaStatementWritable.class);

    setupOutputFormat(job, tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);

    // set mapper and reducer classes
    job.setMapperClass(StatementToMutationMapper.class);
    job.setNumReduceTasks(0);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);

    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS")
                .getValue();
    } else {
        System.out.println("Job Failed!!!");
    }

    return -1;
}
From source file: mvm.rya.accumulo.pig.IndexWritingTool.java
License: Apache License
@Override
public int run(final String[] args) throws Exception {
    Preconditions.checkArgument(args.length == 7, "java " + IndexWritingTool.class.getCanonicalName()
            + " hdfsSaveLocation sparqlFile cbinstance cbzk cbuser cbpassword rdfTablePrefix.");

    final String inputDir = args[0];
    final String sparqlFile = args[1];
    final String instStr = args[2];
    final String zooStr = args[3];
    final String userStr = args[4];
    final String passStr = args[5];
    final String tablePrefix = args[6];

    String sparql = FileUtils.readFileToString(new File(sparqlFile));

    Job job = new Job(getConf(), "Write HDFS Index to Accumulo");
    job.setJarByClass(this.getClass());

    Configuration jobConf = job.getConfiguration();
    jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
    setVarOrders(sparql, jobConf);

    TextInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Mutation.class);

    job.setNumReduceTasks(0);

    String tableName;
    if (zooStr.equals("mock")) {
        tableName = tablePrefix;
    } else {
        tableName = tablePrefix + "INDEX_" + UUID.randomUUID().toString().replace("-", "").toUpperCase();
    }
    setAccumuloOutput(instStr, zooStr, userStr, passStr, job, tableName);

    jobConf.set(sparql_key, sparql);

    int complete = job.waitForCompletion(true) ? 0 : -1;

    if (complete == 0) {
        String[] varOrders = jobConf.getStrings("varOrders");
        String orders = Joiner.on("\u0000").join(varOrders);
        Instance inst;

        if (zooStr.equals("mock")) {
            inst = new MockInstance(instStr);
        } else {
            inst = new ZooKeeperInstance(instStr, zooStr);
        }

        Connector conn = inst.getConnector(userStr, passStr.getBytes());
        BatchWriter bw = conn.createBatchWriter(tableName, 10, 5000, 1);

        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(cardCounter, cardCounter);

        Mutation m = new Mutation("~SPARQL");
        Value v = new Value(sparql.getBytes());
        m.put(new Text("" + c1.getValue()), new Text(orders), v);
        bw.addMutation(m);

        bw.close();

        return complete;
    } else {
        return complete;
    }
}
From source file: net.broomie.JpWordCounter.java
License: Apache License
/**
 * This method is implemented for creating the dfdb with MapReduce.
 * The dfdb means `document frequency' database.
 * @param conf Specify the conf object, which is a hadoop Configuration.
 * @param dfdb Specify the dfdb directory path on HDFS.
 * @return Return `true' if the job succeeds, `false' if it fails.
 * @throws IOException Exception for input file IO.
 * @throws InterruptedException Exception for the return of waitForCompletion().
 * @throws ClassNotFoundException Exception for the Mapper and Reducer classes.
 * @throws URISyntaxException Exception for new URI().
 */
private boolean runCreateDFDB(Configuration conf, String dfdb)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    String reducerNum = conf.get(WORD_COUNTER_REDUCER_NUM);
    Job job = new Job(conf);
    job.setJarByClass(JpWordCounter.class);
    TextInputFormat.addInputPath(job, new Path(in));
    FileSystem fs = FileSystem.get(new URI(dfdb), conf);
    FileStatus[] status = fs.listStatus(new Path(dfdb));
    if (status != null) {
        fs.delete(new Path(dfdb), true);
    }
    fs.close();
    FileOutputFormat.setOutputPath(job, new Path(dfdb));
    job.setMapperClass(DFMapper.class);
    job.setReducerClass(TokenizeReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(Integer.valueOf(reducerNum));
    boolean rv = job.waitForCompletion(true);
    if (rv) {
        Counters counters = job.getCounters();
        long inputNum = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();
        FileSystem hdfs = FileSystem.get(conf);
        String numLinePath = conf.get(PROP_LINE_NUM);
        FSDataOutputStream stream = hdfs.create(new Path(numLinePath));
        stream.writeUTF(String.valueOf((int) inputNum));
        stream.close();
    }
    return rv;
}