Usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
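All of the examples below follow the same basic pattern: run the job (usually via waitForCompletion), then call job.getCounters() and read individual values with findCounter(). The following minimal sketch illustrates that pattern only; the GetCountersExample class name, the argument-supplied input/output paths, and the use of the default identity mapper and reducer are illustrative assumptions and are not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GetCountersExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "getCounters example");
        job.setJarByClass(GetCountersExample.class);
        // Default (identity) mapper and reducer; only the counter access matters here.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);

        // Counters are only complete once the job has finished.
        Counters counters = job.getCounters();
        long mapInputRecords = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
        long reduceOutputRecords = counters.findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue();
        System.out.println("map input records: " + mapInputRecords);
        System.out.println("reduce output records: " + reduceOutputRecords);

        System.exit(success ? 0 : 1);
    }
}

Besides the enum-based lookup shown here, findCounter also accepts a (group, name) string pair, which several of the examples below use for custom counter groups.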
From source file: ilps.hadoop.bin.ToyKbaSystem.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    String in = null;
    String out = null;
    String queryfile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-c".equals(args[i])) {
                corpus_id = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";
    if (teamname == null)
        teamname = "CompInsights";
    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";
    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    FileSystem fs = FileSystem.get(conf);

    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);

    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(ToyKbaSystem.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringLongPair.class);

    // job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    /*
    for (String g : job.getCounters().getGroupNames()) {
        Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters().getGroup(g).iterator();
        LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
        while (it.hasNext()) {
            org.apache.hadoop.mapreduce.Counter c = it.next();
            LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
        }
    }
    */

    // add some more statistics
    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);
    System.out.println("#" + new Filter_run.Factory().toJSON(fr));

    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString());
    }
    System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;
}
From source file: io.aos.mapreduce.count.WordCountToolTest.java
License: Apache License
@Test
public void testJob() throws Exception {
    final Job job = Job.getInstance(mrCluster.getConfig());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, new Path(outDir, "testJob"));
    assertTrue(job.waitForCompletion(true));
    validateCounters(job.getCounters(), 5, 25, 5, 5);
}
From source file: io.covert.dns.collection.CollectionJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        usage("");
    }

    String dclass = args[0];
    String types = args[1];
    String inDir = args[2];
    String outDir = args[3];

    Configuration conf = getConf();
    if (conf.get("dns.collection.num.resolvers") == null)
        conf.setInt("dns.collection.num.resolvers", 50);
    if (conf.get("dns.collection.nameservers") == null)
        conf.set("dns.collection.nameservers", "127.0.0.1");

    Job job = new Job(conf);
    job.setJobName(CollectionJob.class.getSimpleName() + ": types=" + types + ", dclass=" + dclass
            + " inDir=" + inDir + ", outDir=" + outDir + ", resolvers="
            + conf.get("dns.collection.nameservers"));
    job.setJarByClass(getClass());

    job.setMapperClass(CollectionMapper.class);
    job.setNumReduceTasks(0);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormatClass(DnsRequestInputFormat.class);
    DnsRequestInputFormat.setInputPaths(job, new Path(inDir));
    DnsRequestInputFormat.configure(job, dclass.toUpperCase(), Arrays.asList(types.split(",")),
            Arrays.asList(""));

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outDir));
    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.submit();
    int retVal = job.waitForCompletion(true) ? 0 : 1;

    CounterGroup counters = job.getCounters().getGroup(CollectionMapper.RESOLVER_GROUP);
    Counter constructMessageMS = counters.findCounter(CollectionMapper.CONSTRUCT_MESSAGE_MS);
    Counter parseResponseMS = counters.findCounter(CollectionMapper.PARSE_RESPONSE_MS);
    Counter performRequestMS = counters.findCounter(CollectionMapper.PERFORM_REQUEST_MS);
    Counter totalRequestHandlingMS = counters.findCounter(CollectionMapper.TOTAL_REQUEST_HANDLING_MS);

    Log.info("Total ConstructMessage percent: "
            + (double) (constructMessageMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));
    Log.info("Total ParseResponse percent: "
            + (double) (parseResponseMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));
    Log.info("Total PerformRequest percent: "
            + (double) (performRequestMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));

    return retVal;
}
From source file: io.dataapps.chlorine.hadoop.DeepScanPipeline.java
License: Apache License
public void run() {
    try {
        final Path fsScanPath = new Path(scanPath);
        final Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        if (findersFilePath != null) {
            fs.copyFromLocalFile(false, true, new Path(findersFilePath), new Path("chlorine_finders.xml"));
        }
        Job job = HDFSScanMR.makeJob(conf, fsScanPath, new Path(jobOutputDir), matchPath, scanSince,
                findersFilePath, queue, maskPath);
        boolean bResult = runJobToCompletion(job);
        if (bResult) {
            LOG.info("Total bytes scanned = "
                    + job.getCounters().findCounter("Feature", "TotalSize").getValue());
            LOG.info("Total records scanned = " + job.getCounters()
                    .findCounter("org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS").getValue());
            LOG.info("Total Matched records = "
                    + job.getCounters().findCounter("Feature", "MatchedRecords").getValue());
            LOG.info("Total matches = "
                    + job.getCounters().findCounter("Feature", "TotalMatches").getValue());
            FinderEngine engine = new FinderEngine();
            for (Finder finder : engine.getFinders()) {
                long l = job.getCounters().findCounter("Feature", finder.getName()).getValue();
                if (l > 0) {
                    // Log the per-finder match count.
                    LOG.info(finder.getName() + " = " + l);
                }
            }
            if (matchPath != null) {
                String tempMatchesPath = jobOutputDir + Path.SEPARATOR + "_temp";
                String matchOutputPath = matchPath + Path.SEPARATOR + "scan_result_" + scanPath.hashCode()
                        + "_" + scanSince;
                FileUtil.copyMerge(fs, new Path(tempMatchesPath), fs, new Path(matchOutputPath), true, conf, null);
                LOG.info("The matches detected are stored in " + matchOutputPath);
            }
            if (maskPath != null) {
                LOG.info("The matches in the input are masked and a copy is kept under " + maskPath);
            }
        }
    } catch (IOException e) {
        LOG.error(e);
    }
}
From source file: io.druid.indexer.IndexGeneratorJob.java
License: Apache License
public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));
        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // hack to get druid.processing.bitmap property passed down to hadoop job.
        // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property, String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: io.druid.indexer.updater.HadoopConverterJob.java
License: Apache License
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE("Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
From source file: ir.ac.ut.snl.mrcd.StageOne.java
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job;
    String input, output;
    int iterationCount = 0;
    long terminationValue = 1;
    boolean result = false;

    while (terminationValue > 0) {
        job = new Job();
        if (iterationCount == 0) {
            input = args[0];
        } else {
            input = args[1] + iterationCount;
        }
        output = args[1] + (iterationCount + 1);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setJarByClass(StageOne.class);
        job.setJobName("Stage one");
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapperClass(StageOneMapper.class);
        job.setReducerClass(StageOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ShortestPathTuple.class);

        result = job.waitForCompletion(true);

        Counters jobCounters = job.getCounters();
        terminationValue = jobCounters.findCounter(StageOneCounter.ALL_ACTIVE).getValue();
        iterationCount++;
    }
    return 0;
}
From source file: ivory.app.TrecForwardIndexBuilder.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    String tmpDir = "tmp-" + TrecForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    Configuration conf = getConf();
    conf.set("mapreduce.map.memory.mb", "4096");
    conf.set("mapreduce.map.java.opts", "-Xmx4096m");

    Job job = new Job(conf, TrecForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    job.setJarByClass(TrecForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(getConf());

    LOG.info("Tool name: " + TrecForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - DocnoMapping file: " + mappingFile);
    LOG.info(" - temp output directory: " + tmpDir);

    job.setNumReduceTasks(1);

    if (job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }

    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(tmpDir), true);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

    String inputFile = tmpDir + "/" + "part-r-00000";
    LOG.info("Writing " + numDocs + " doc offsets to " + indexFile);

    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);

    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    fs.delete(new Path(tmpDir), true);

    return 0;
}
From source file: ivory.core.preprocess.BuildTermDocVectors.java
License: Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0); LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!"); return 0; }//from www. ja v a 2s. c om DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName); job1.setJarByClass(BuildTermDocVectors.class); job1.setNumReduceTasks(numReducers); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Write out number of postings. 
int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable:" + collectionName); job2.setJarByClass(BuildTermDocVectors.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file: ivory.core.preprocess.ComputeGlobalTermStatistics.java
License: Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = 10;

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't exist: skipping!");
        return 0;
    }

    if (!fs.exists(new Path(termDocVectorsPath))) {
        LOG.info("term doc vectors path doesn't exist: skipping!");
        return 0;
    }

    LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exists: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(ComputeGlobalTermStatistics.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out the collection term count. NOTE: this value is not the same as the number of
    // postings, because postings for non-English terms are discarded, or dropped as a result of the df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}