Usage examples for org.apache.hadoop.mapreduce.Job.getCounters()
public Counters getCounters() throws IOException
From source file:org.apache.sqoop.config.ConfigurationHelper.java
License:Apache License
/** * @return the number of mapper input records from a job using its counters. *///from w ww . j a va2s. c o m public static long getNumMapInputRecords(Job job) throws IOException, InterruptedException { return job.getCounters().findCounter(ConfigurationConstants.COUNTER_GROUP_MAPRED_TASK_COUNTERS, ConfigurationConstants.COUNTER_MAP_INPUT_RECORDS).getValue(); }
From source file:org.apache.sqoop.mapreduce.ExportJobBase.java
License:Apache License
@Override protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException { PerfCounters perfCounters = new PerfCounters(); perfCounters.startClock();//from w w w . j av a 2 s. c o m boolean success = doSubmitJob(job); perfCounters.stopClock(); Counters jobCounters = job.getCounters(); // If the job has been retired, these may be unavailable. if (null == jobCounters) { displayRetiredJobNotice(LOG); } else { perfCounters .addBytes(jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_READ").getValue()); LOG.info("Transferred " + perfCounters.toString()); long numRecords = ConfigurationHelper.getNumMapInputRecords(job); LOG.info("Exported " + numRecords + " records."); } return success; }
From source file:org.apache.sqoop.mapreduce.ImportJobBase.java
License:Apache License
/** * Actually run the MapReduce job.//from ww w . j av a2 s .co m */ @Override protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException { PerfCounters perfCounters = new PerfCounters(); perfCounters.startClock(); boolean success = doSubmitJob(job); if (isHCatJob) { SqoopHCatUtilities.instance().invokeOutputCommitterForLocalMode(job); } perfCounters.stopClock(); Counters jobCounters = job.getCounters(); // If the job has been retired, these may be unavailable. if (null == jobCounters) { displayRetiredJobNotice(LOG); } else { perfCounters.addBytes( jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_WRITTEN").getValue()); LOG.info("Transferred " + perfCounters.toString()); long numRecords = ConfigurationHelper.getNumMapOutputRecords(job); LOG.info("Retrieved " + numRecords + " records."); } return success; }
From source file:org.clueweb.clueweb09.app.CountWarcRecordsNew.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Parses the command line, launches a map-only job over the given input
 * (ClueWeb09 input format, no output), and logs the value of the
 * {@code Records.PAGES} counter.
 *
 * @param args command-line arguments; the input option is required
 * @return 0 if the job completed successfully; -1 on a usage error or if the
 *         job failed
 * @throws Exception if job setup or submission fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
    LOG.info(" - input: " + input);

    Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
    job.setJarByClass(CountWarcRecordsNew.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job, input);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);

    // Keep the outcome so a failed job is reported through the exit code.
    boolean succeeded = job.waitForCompletion(true);

    Counters counters = job.getCounters();
    if (counters == null) {
        // Counters can be unavailable, e.g. once the job has been retired.
        LOG.warn("Job counters are unavailable; cannot report the number of docs read.");
    } else {
        // Counter values are longs; do not narrow them to int.
        long numDocs = counters.findCounter(Records.PAGES).getValue();
        LOG.info("Read " + numDocs + " docs.");
    }

    return succeeded ? 0 : -1;
}
From source file:org.clueweb.clueweb09.app.DocumentExtractor.java
License:Apache License
/** * Runs this tool./*from w w w . ja v a 2 s. c o m*/ */ @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("docids file path").create(DOCIDS_FILE)); options.addOption( OptionBuilder.withArgName("true|false").hasArg().withDescription("keep HTML").create(KEEP_HTML)); options.addOption(OptionBuilder.withArgName("true|false").hasArg().withDescription("remove duplicates") .create(REMOVE_DUPLICATES)); options.addOption(OptionBuilder.withArgName("string " + HTMLParserFactory.getOptions()).hasArg() .withDescription("htmlParser").create(HTML_PARSER)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(KEEP_HTML) || !cmdline.hasOption(REMOVE_DUPLICATES)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String input = cmdline.getOptionValue(INPUT_OPTION); String output = cmdline.getOptionValue(OUTPUT_OPTION); String docidsfile = cmdline.getOptionValue(DOCIDS_FILE); boolean keephtml = (cmdline.getOptionValue(KEEP_HTML).equals("true")) ? true : false; boolean removeDuplicates = (cmdline.getOptionValue(REMOVE_DUPLICATES).equals("true")) ? 
true : false; String htmlParser = (keephtml == false) ? cmdline.getOptionValue(HTML_PARSER) : ""; LOG.info("Tool name: " + DocumentExtractor.class.getSimpleName()); LOG.info(" - input: " + input); LOG.info(" - output: " + output); LOG.info(" - docidsfile: " + docidsfile); LOG.info(" - keephtml: " + keephtml); LOG.info(" - htmlParser: " + htmlParser); LOG.info(" - removeDuplicates: " + removeDuplicates); Configuration conf = getConf(); conf.set(DOCIDS_FILE, docidsfile); conf.setBoolean(KEEP_HTML, keephtml); conf.setBoolean(REMOVE_DUPLICATES, removeDuplicates); conf.set(OUTPUT_OPTION, output); conf.set(HTML_PARSER, htmlParser); Job job = new Job(getConf(), DocumentExtractor.class.getSimpleName() + ":" + input); job.setJarByClass(DocumentExtractor.class); FileInputFormat.setInputPaths(job, input); job.setInputFormatClass(ClueWeb09InputFormat.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(NullWritable.class); job.setMapperClass(MyMapper.class); job.setNumReduceTasks(0); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); int numDocsFound = (int) job.getCounters().findCounter(Records.DOCUMENTS_FOUND).getValue(); LOG.info("Number of documents found: " + numDocsFound); int numExceptions = (int) job.getCounters().findCounter(Records.HTML_PARSER_EXCEPTIONS).getValue(); LOG.info("Number of HTML parser exceptions: " + numExceptions); int numDocsWritten = (int) job.getCounters().findCounter(Records.DOCUMENTS_WRITTEN_TO_FILE).getValue(); LOG.info("Number of documents written to file: " + numDocsWritten); int numDuplicatesFound = (int) job.getCounters().findCounter(Records.DUPLICATES_FOUND).getValue(); LOG.info("Number of duplicates not written to file: " + numDuplicatesFound); return 0; }
From source file:org.clueweb.clueweb12.app.CountClueWarcRecordsNew.java
License:Apache License
/** * Runs this tool./*from w ww .jav a 2s. co m*/ */ @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String input = cmdline.getOptionValue(INPUT_OPTION); LOG.info("Tool name: " + CountClueWarcRecordsNew.class.getSimpleName()); LOG.info(" - input: " + input); Job job = new Job(getConf(), CountClueWarcRecordsNew.class.getSimpleName() + ":" + input); job.setJarByClass(CountClueWarcRecordsNew.class); job.setNumReduceTasks(0); FileInputFormat.addInputPaths(job, input); job.setInputFormatClass(ClueWarcInputFormat.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapperClass(MyMapper.class); job.waitForCompletion(true); Counters counters = job.getCounters(); int numDocs = (int) counters.findCounter(Records.PAGES).getValue(); LOG.info("Read " + numDocs + " docs."); return 0; }
From source file:org.clueweb.clueweb12.app.CountWarcRecordsNew.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Parses the command line, launches a map-only job over the given input
 * ({@code ClueWeb12InputFormat}, no output), and logs the value of the
 * {@code Records.PAGES} counter.
 *
 * @param args command-line arguments; the input option is required
 * @return 0 if the job completed successfully; -1 on a usage error or if the
 *         job failed
 * @throws Exception if job setup or submission fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
    LOG.info(" - input: " + input);

    Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
    job.setJarByClass(CountWarcRecordsNew.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job, input);

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);

    // Keep the outcome so a failed job is reported through the exit code.
    boolean succeeded = job.waitForCompletion(true);

    Counters counters = job.getCounters();
    if (counters == null) {
        // Counters can be unavailable, e.g. once the job has been retired.
        LOG.warn("Job counters are unavailable; cannot report the number of docs read.");
    } else {
        // Counter values are longs; do not narrow them to int.
        long numDocs = counters.findCounter(Records.PAGES).getValue();
        LOG.info("Read " + numDocs + " docs.");
    }

    return succeeded ? 0 : -1;
}
From source file:org.clueweb.clueweb12.app.DocumentExtractor.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Parses the command line, copies the options into the job configuration,
 * launches a map-only job over the given input ({@code ClueWeb12InputFormat},
 * no MapReduce output), and logs the {@code Records} counters of the job.
 *
 * <p>Required options: input, output, docids file and keep-HTML. The HTML
 * parser option is additionally required when keep-HTML is not
 * {@code "true"}. Missing options produce a usage message instead of a
 * failure while populating the configuration.
 *
 * @param args command-line arguments
 * @return 0 if the job completed successfully; -1 on a usage error or if the
 *         job failed
 * @throws Exception if job setup or submission fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("docids file path").create(DOCIDS_FILE));
    options.addOption(
            OptionBuilder.withArgName("true|false").hasArg().withDescription("keep HTML").create(KEEP_HTML));
    options.addOption(OptionBuilder.withArgName("string " + HTMLParserFactory.getOptions()).hasArg()
            .withDescription("htmlParser").create(HTML_PARSER));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    // DOCIDS_FILE is stored in the configuration below, which rejects null
    // values, so it has to be present as well.
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DOCIDS_FILE) || !cmdline.hasOption(KEEP_HTML)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String docidsfile = cmdline.getOptionValue(DOCIDS_FILE);
    boolean keephtml = "true".equals(cmdline.getOptionValue(KEEP_HTML));
    String htmlParser = keephtml ? "" : cmdline.getOptionValue(HTML_PARSER);

    // A parser is only needed when the HTML is not kept; in that case the
    // option must have been supplied.
    if (htmlParser == null) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Option " + HTML_PARSER + " is required when " + KEEP_HTML + " is not true");
        return -1;
    }

    LOG.info("Tool name: " + DocumentExtractor.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - docidsfile: " + docidsfile);
    LOG.info(" - keephtml: " + keephtml);
    LOG.info(" - htmlParser: " + htmlParser);

    Configuration conf = getConf();
    conf.set(DOCIDS_FILE, docidsfile);
    conf.setBoolean(KEEP_HTML, keephtml);
    conf.set(OUTPUT_OPTION, output);
    conf.set(HTML_PARSER, htmlParser);

    // Build the job from the configuration that was just populated.
    Job job = new Job(conf, DocumentExtractor.class.getSimpleName() + ":" + input);
    job.setJarByClass(DocumentExtractor.class);

    FileInputFormat.setInputPaths(job, input);

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);

    long startTime = System.currentTimeMillis();
    // Keep the outcome so a failed job is reported through the exit code.
    boolean succeeded = job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Fetch the counters once instead of once per value. The type is fully
    // qualified so that this method needs no additional import.
    org.apache.hadoop.mapreduce.Counters counters = job.getCounters();
    if (counters == null) {
        // Counters can be unavailable, e.g. once the job has been retired.
        LOG.warn("Job counters are unavailable; cannot report document statistics.");
    } else {
        // Counter values are longs; do not narrow them to int.
        long numDocsFound = counters.findCounter(Records.DOCUMENTS_FOUND).getValue();
        LOG.info("Number of documents found: " + numDocsFound);

        long numExceptions = counters.findCounter(Records.HTML_PARSER_EXCEPTIONS).getValue();
        LOG.info("Number of HTML parser exceptions: " + numExceptions);
    }

    return succeeded ? 0 : -1;
}
From source file:org.clueweb.clueweb12.app.DuplicateFiltering.java
License:Apache License
/** * Runs this tool.// w w w.jav a 2s. c om */ @SuppressWarnings({ "static-access", "deprecation" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("input path (pfor format expected, add * to retrieve files)") .create(DOCVECTOR_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(TREC_RESULT_FILE)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION)); options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK)); options.addOption(OptionBuilder.withArgName("float [0-1]").hasArg() .withDescription("cosine similarity threshold").create(SIM_THRESHOLD)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(TREC_RESULT_FILE) || !cmdline.hasOption(SIM_THRESHOLD) || !cmdline.hasOption(TOPK)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION); String trecinput = cmdline.getOptionValue(TREC_RESULT_FILE); String output = cmdline.getOptionValue(OUTPUT_OPTION); String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); String simThreshold = 
cmdline.getOptionValue(SIM_THRESHOLD); String topk = cmdline.getOptionValue(TOPK); LOG.info("Tool name: " + DuplicateFiltering.class.getSimpleName()); LOG.info(" - docvector: " + docvector); LOG.info(" - trecinputfile: " + trecinput); LOG.info(" - output: " + output); LOG.info(" - dictionary: " + dictionary); LOG.info(" - cosine similarity threshold: " + SIM_THRESHOLD); LOG.info(" - topk: " + topk); Configuration conf = getConf(); conf.set(DICTIONARY_OPTION, dictionary); conf.setFloat(SIM_THRESHOLD, Float.parseFloat(simThreshold)); conf.set(TREC_RESULT_FILE, trecinput); conf.setInt(TOPK, Integer.parseInt(topk)); conf.set("mapred.task.timeout", "6000000");// default is 600000 FileSystem fs = FileSystem.get(conf); if (fs.exists(new Path(output))) fs.delete(new Path(output)); Job job = new Job(conf, DuplicateFiltering.class.getSimpleName() + ":" + docvector); job.setJarByClass(DuplicateFiltering.class); FileInputFormat.setInputPaths(job, docvector); FileOutputFormat.setOutputPath(job, new Path(output)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapOutputKeyClass(PairOfIntString.class); job.setMapOutputValueClass(FloatArrayWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setMapperClass(MyMapper.class); job.setPartitionerClass(MyPartitioner.class); job.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue(); LOG.info("Number of duplicates: " + numDuplicates); return 0; }
From source file:org.culturegraph.mf.cluster.job.merge.Union.java
License:Apache License
/**
 * Runs the union job chain.
 *
 * <p>After an initial "explode" job, "recollect" and "explode" jobs are run
 * alternately, each round reading the previous round's output, until the
 * {@code OPEN_CLASSES} counter in the {@code UNION_FIND} group of the last
 * explode job is zero. A final map-only job then reads every
 * {@code recollect_*} directory.
 *
 * @param args command-line arguments (not used by this method)
 * @return -1 if the configuration check fails, otherwise 1
 * @throws Exception if a job cannot be set up or run
 */
@Override
public int run(final String[] args) throws Exception {
    final String tmp = makeTmp();

    if (!configChecker.logAndVerify(LOG, getConf())) {
        return -1;
    }

    // Seed the iteration with a first explode pass.
    final Job initialExplode = new Job(getConf(), "initial explode");
    initialExplode.setSpeculativeExecution(false);
    initialExplode.setJarByClass(Union.class);
    AbstractJobLauncher.configurePropertyTableMapper(initialExplode, getConf(), InputTableMapper.class,
            Text.class, TextArrayWritable.class);
    configureReducer(initialExplode, ExplodeReducer.class, new Path(tmp + "explode_0"),
            SequenceFileOutputFormat.class);
    initialExplode.setNumReduceTasks(2);
    initialExplode.waitForCompletion(true);

    int round = 0;
    boolean ongoingMerges;
    do {
        // Round n reads explode_n, writes recollect_n, then writes explode_(n+1).
        final String explodeInput = tmp + "explode_" + round;
        final String recollectDir = tmp + "recollect_" + round;
        final String explodeOutput = tmp + "explode_" + (round + 1);

        final Job recollect = new Job(getConf(), "recollect");
        recollect.setJarByClass(Union.class);
        configureMapper(recollect, RecollectMapper.class, new Path(explodeInput),
                SequenceFileInputFormat.class);
        configureReducer(recollect, RecollectReducer.class, new Path(recollectDir),
                SequenceFileOutputFormat.class);
        recollect.setNumReduceTasks(2);
        recollect.waitForCompletion(true);

        final Job explode = new Job(getConf(), "explode");
        explode.setJarByClass(Union.class);
        configureMapper(explode, ExplodeMapper.class, new Path(recollectDir),
                SequenceFileInputFormat.class);
        configureReducer(explode, ExplodeReducer.class, new Path(explodeOutput),
                SequenceFileOutputFormat.class);
        explode.setNumReduceTasks(2);
        explode.waitForCompletion(true);

        round++;

        final long openClasses =
                explode.getCounters().getGroup(UNION_FIND).findCounter(OPEN_CLASSES).getValue();
        ongoingMerges = openClasses != 0;
        LOG.info("ongoingMerges=" + ongoingMerges);
    } while (ongoingMerges);

    // Map-only pass over all recollect outputs.
    final Job collectResult = new Job(HBaseConfiguration.create(getConf()), "collect result");
    collectResult.setSpeculativeExecution(false);
    collectResult.setJarByClass(Union.class);
    FileInputFormat.addInputPath(collectResult, new Path(tmp + "recollect_*"));
    collectResult.setInputFormatClass(SequenceFileInputFormat.class);
    collectResult.setMapperClass(ResultMapper.class);
    collectResult.setNumReduceTasks(0);
    collectResult.setOutputFormatClass(NullOutputFormat.class);
    collectResult.waitForCompletion(true);

    // NOTE(review): returns 1 rather than the conventional 0 after the final job - confirm this is intended.
    return 1;
}