List of usage examples for org.apache.hadoop.mapred Counters findCounter
@Deprecated public Counter findCounter(String group, int id, String name)
From source file:edu.umd.cloud9.collection.medline.NumberMedlineCitations.java
License:Apache License
/** * Runs this tool./* ww w . j a v a2s .co m*/ */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String outputFile = args[2]; int mapTasks = Integer.parseInt(args[3]); sLogger.info("Tool name: NumberMedlineCitations"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Output path: " + outputPath); sLogger.info(" - Output file: " + outputFile); sLogger.info("Launching with " + mapTasks + " mappers..."); JobConf conf = new JobConf(getConf(), NumberMedlineCitations.class); conf.setJobName("NumberMedlineCitations"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(MedlineCitationInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); // write out various properties Counters counters = job.getCounters(); Counter counter = counters.findCounter("edu.umd.cloud9.collection.medline.NumberMedlineCitations$Citations", 0, ""); int numdocs = (int) counter.getCounter(); sLogger.info("total number of docs: " + numdocs); MedlineDocnoMapping.writeDocidData(outputPath + "/part-00000", outputFile); return 0; }
From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class); FileSystem fs = FileSystem.get(conf); String collectionName = conf.get("Ivory.CollectionName"); String indexPaths = conf.get("Ivory.IndexPaths"); String dataOutputPath = conf.get("Ivory.DataOutputPath"); int dfThreshold = conf.getInt("Ivory.DfThreshold", 0); // first, compute size of global term space Path tmpPaths = new Path("/tmp/index-paths.txt"); FSDataOutputStream out = fs.create(tmpPaths, true); for (String s : indexPaths.split(",")) { out.write(new String(s + "\n").getBytes()); }//from w ww.j av a 2s. c om out.close(); LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments"); conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName); FileInputFormat.addInputPath(conf, tmpPaths); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInputFormat(NLineInputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PairOfIntLong.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); long totalNumTerms = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS") .getCounter(); LOG.info("total number of terms in global dictionary = " + totalNumTerms); // now build the dictionary fs.delete(new Path(dataOutputPath), true); conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class); LOG.info("Job: MergeGlobalStatsAcrossIndexSegments"); conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName); FileInputFormat.addInputPath(conf, tmpPaths); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInputFormat(NLineInputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PairOfIntLong.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms); startTime = System.currentTimeMillis(); job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // compute some # docs, collection length, avg doc length long collectionLength = 0; int docCount = 0; for (String index : indexPaths.split(",")) { LOG.info("reading stats for " + index); RetrievalEnvironment env = new RetrievalEnvironment(index, fs); long l = env.readCollectionLength(); int n = env.readCollectionDocumentCount(); LOG.info(" - CollectionLength: " + l); LOG.info(" - CollectionDocumentCount: " + n); collectionLength += l; docCount += n; } float avgdl = (float) collectionLength / docCount; LOG.info("all index segments: "); LOG.info(" - CollectionLength: " + collectionLength); LOG.info(" - CollectionDocumentCount: " + docCount); LOG.info(" - AverageDocumentLenght: " + avgdl); RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs); env.writeCollectionAverageDocumentLength(avgdl); env.writeCollectionLength(collectionLength); env.writeCollectionDocumentCount(docCount); return 0; }