Example usage for org.apache.hadoop.mapreduce Job getCounters

Introduction

This page collects example usages of org.apache.hadoop.mapreduce Job getCounters, drawn from open-source projects.

Prototype

public Counters getCounters() throws IOException 

Document

Gets the counters for this job.
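
A minimal sketch of the call pattern, for orientation before the project examples below: after the job completes, call getCounters() and read both a framework counter and an application-defined counter. The class name and the MyCounters enum are illustrative assumptions rather than code from any project on this page; the null check matters because getCounters() can return null once the job has been retired.

import java.io.IOException;

import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterReadSketch {

    // Hypothetical application-defined counter, incremented in a mapper via
    // context.getCounter(MyCounters.RECORDS_SEEN).increment(1).
    public enum MyCounters { RECORDS_SEEN }

    public static void printCounters(Job job) throws IOException {
        // Counters may be unavailable if the job has already been retired.
        Counters counters = job.getCounters();
        if (counters == null) {
            System.err.println("Counters unavailable (job may have been retired).");
            return;
        }
        // A built-in framework counter.
        long mapInput = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
        // The application-defined counter.
        long seen = counters.findCounter(MyCounters.RECORDS_SEEN).getValue();
        System.out.println("Map input records: " + mapInput);
        System.out.println("Records seen: " + seen);
    }
}

Several of the examples below (from Sqoop) instead look counters up by string group and name, e.g. jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_READ"), and they check for a null Counters object because a retired job no longer exposes its counters.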

Usage

From source file:org.apache.sqoop.config.ConfigurationHelper.java

License:Apache License

/**
 * @return the number of mapper input records from a job using its counters.
 */
public static long getNumMapInputRecords(Job job) throws IOException, InterruptedException {
    return job.getCounters().findCounter(ConfigurationConstants.COUNTER_GROUP_MAPRED_TASK_COUNTERS,
            ConfigurationConstants.COUNTER_MAP_INPUT_RECORDS).getValue();
}

From source file:org.apache.sqoop.mapreduce.ExportJobBase.java

License:Apache License

@Override
protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {

    PerfCounters perfCounters = new PerfCounters();
    perfCounters.startClock();

    boolean success = doSubmitJob(job);
    perfCounters.stopClock();

    Counters jobCounters = job.getCounters();
    // If the job has been retired, these may be unavailable.
    if (null == jobCounters) {
        displayRetiredJobNotice(LOG);
    } else {
        perfCounters
                .addBytes(jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_READ").getValue());
        LOG.info("Transferred " + perfCounters.toString());
        long numRecords = ConfigurationHelper.getNumMapInputRecords(job);
        LOG.info("Exported " + numRecords + " records.");
    }

    return success;
}

From source file:org.apache.sqoop.mapreduce.ImportJobBase.java

License:Apache License

/**
 * Actually run the MapReduce job.
 */
@Override
protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {

    PerfCounters perfCounters = new PerfCounters();
    perfCounters.startClock();

    boolean success = doSubmitJob(job);

    if (isHCatJob) {
        SqoopHCatUtilities.instance().invokeOutputCommitterForLocalMode(job);
    }

    perfCounters.stopClock();

    Counters jobCounters = job.getCounters();
    // If the job has been retired, these may be unavailable.
    if (null == jobCounters) {
        displayRetiredJobNotice(LOG);
    } else {
        perfCounters.addBytes(
                jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_WRITTEN").getValue());
        LOG.info("Transferred " + perfCounters.toString());
        long numRecords = ConfigurationHelper.getNumMapOutputRecords(job);
        LOG.info("Retrieved " + numRecords + " records.");
    }
    return success;
}

From source file:org.clueweb.clueweb09.app.CountWarcRecordsNew.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
    LOG.info(" - input: " + input);

    Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
    job.setJarByClass(CountWarcRecordsNew.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job, input);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}

From source file:org.clueweb.clueweb09.app.DocumentExtractor.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("docids file path").create(DOCIDS_FILE));
    options.addOption(
            OptionBuilder.withArgName("true|false").hasArg().withDescription("keep HTML").create(KEEP_HTML));
    options.addOption(OptionBuilder.withArgName("true|false").hasArg().withDescription("remove duplicates")
            .create(REMOVE_DUPLICATES));
    options.addOption(OptionBuilder.withArgName("string " + HTMLParserFactory.getOptions()).hasArg()
            .withDescription("htmlParser").create(HTML_PARSER));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(KEEP_HTML)
            || !cmdline.hasOption(REMOVE_DUPLICATES)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String docidsfile = cmdline.getOptionValue(DOCIDS_FILE);
    boolean keephtml = cmdline.getOptionValue(KEEP_HTML).equals("true");
    boolean removeDuplicates = cmdline.getOptionValue(REMOVE_DUPLICATES).equals("true");
    String htmlParser = keephtml ? "" : cmdline.getOptionValue(HTML_PARSER);

    LOG.info("Tool name: " + DocumentExtractor.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - docidsfile: " + docidsfile);
    LOG.info(" - keephtml: " + keephtml);
    LOG.info(" - htmlParser: " + htmlParser);
    LOG.info(" - removeDuplicates: " + removeDuplicates);

    Configuration conf = getConf();
    conf.set(DOCIDS_FILE, docidsfile);
    conf.setBoolean(KEEP_HTML, keephtml);
    conf.setBoolean(REMOVE_DUPLICATES, removeDuplicates);
    conf.set(OUTPUT_OPTION, output);
    conf.set(HTML_PARSER, htmlParser);

    Job job = new Job(getConf(), DocumentExtractor.class.getSimpleName() + ":" + input);
    job.setJarByClass(DocumentExtractor.class);

    FileInputFormat.setInputPaths(job, input);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int numDocsFound = (int) job.getCounters().findCounter(Records.DOCUMENTS_FOUND).getValue();
    LOG.info("Number of documents found: " + numDocsFound);

    int numExceptions = (int) job.getCounters().findCounter(Records.HTML_PARSER_EXCEPTIONS).getValue();
    LOG.info("Number of HTML parser exceptions: " + numExceptions);

    int numDocsWritten = (int) job.getCounters().findCounter(Records.DOCUMENTS_WRITTEN_TO_FILE).getValue();
    LOG.info("Number of documents written to file: " + numDocsWritten);

    int numDuplicatesFound = (int) job.getCounters().findCounter(Records.DUPLICATES_FOUND).getValue();
    LOG.info("Number of duplicates not written to file: " + numDuplicatesFound);

    return 0;
}

From source file:org.clueweb.clueweb12.app.CountClueWarcRecordsNew.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountClueWarcRecordsNew.class.getSimpleName());
    LOG.info(" - input: " + input);

    Job job = new Job(getConf(), CountClueWarcRecordsNew.class.getSimpleName() + ":" + input);
    job.setJarByClass(CountClueWarcRecordsNew.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job, input);

    job.setInputFormatClass(ClueWarcInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}

From source file:org.clueweb.clueweb12.app.CountWarcRecordsNew.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName());
    LOG.info(" - input: " + input);

    Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input);
    job.setJarByClass(CountWarcRecordsNew.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job, input);

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getValue();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}

From source file:org.clueweb.clueweb12.app.DocumentExtractor.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("docids file path").create(DOCIDS_FILE));
    options.addOption(
            OptionBuilder.withArgName("true|false").hasArg().withDescription("keep HTML").create(KEEP_HTML));
    options.addOption(OptionBuilder.withArgName("string " + HTMLParserFactory.getOptions()).hasArg()
            .withDescription("htmlParser").create(HTML_PARSER));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(KEEP_HTML)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String docidsfile = cmdline.getOptionValue(DOCIDS_FILE);
    boolean keephtml = cmdline.getOptionValue(KEEP_HTML).equals("true");
    String htmlParser = keephtml ? "" : cmdline.getOptionValue(HTML_PARSER);

    LOG.info("Tool name: " + DocumentExtractor.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    LOG.info(" - docidsfile: " + docidsfile);
    LOG.info(" - keephtml: " + keephtml);
    LOG.info(" - htmlParser: " + htmlParser);

    Configuration conf = getConf();
    conf.set(DOCIDS_FILE, docidsfile);
    conf.setBoolean(KEEP_HTML, keephtml);
    conf.set(OUTPUT_OPTION, output);
    conf.set(HTML_PARSER, htmlParser);

    Job job = new Job(getConf(), DocumentExtractor.class.getSimpleName() + ":" + input);
    job.setJarByClass(DocumentExtractor.class);

    FileInputFormat.setInputPaths(job, input);

    job.setInputFormatClass(ClueWeb12InputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int numDocsFound = (int) job.getCounters().findCounter(Records.DOCUMENTS_FOUND).getValue();
    LOG.info("Number of documents found: " + numDocsFound);

    int numExceptions = (int) job.getCounters().findCounter(Records.HTML_PARSER_EXCEPTIONS).getValue();
    LOG.info("Number of HTML parser exceptions: " + numExceptions);

    return 0;
}

From source file:org.clueweb.clueweb12.app.DuplicateFiltering.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access", "deprecation" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(TREC_RESULT_FILE));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("float [0-1]").hasArg()
            .withDescription("cosine similarity threshold").create(SIM_THRESHOLD));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(TREC_RESULT_FILE)
            || !cmdline.hasOption(SIM_THRESHOLD) || !cmdline.hasOption(TOPK)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String trecinput = cmdline.getOptionValue(TREC_RESULT_FILE);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String simThreshold = cmdline.getOptionValue(SIM_THRESHOLD);
    String topk = cmdline.getOptionValue(TOPK);

    LOG.info("Tool name: " + DuplicateFiltering.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - trecinputfile: " + trecinput);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - cosine similarity threshold: " + SIM_THRESHOLD);
    LOG.info(" - topk: " + topk);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.setFloat(SIM_THRESHOLD, Float.parseFloat(simThreshold));
    conf.set(TREC_RESULT_FILE, trecinput);
    conf.setInt(TOPK, Integer.parseInt(topk));

    conf.set("mapred.task.timeout", "6000000");// default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output)))
        fs.delete(new Path(output));

    Job job = new Job(conf, DuplicateFiltering.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(DuplicateFiltering.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue();
    LOG.info("Number of duplicates: " + numDuplicates);

    return 0;
}

From source file:org.culturegraph.mf.cluster.job.merge.Union.java

License:Apache License

@Override
public int run(final String[] args) throws Exception {
    final String tmp = makeTmp();

    if (!configChecker.logAndVerify(LOG, getConf())) {
        return -1;
    }

    Job job;
    boolean ongoingMerges = true;

    job = new Job(getConf(), "initial explode");
    job.setSpeculativeExecution(false);
    job.setJarByClass(Union.class);
    AbstractJobLauncher.configurePropertyTableMapper(job, getConf(), InputTableMapper.class, Text.class,
            TextArrayWritable.class);
    configureReducer(job, ExplodeReducer.class, new Path(tmp + "explode_0"), SequenceFileOutputFormat.class);
    job.setNumReduceTasks(2);
    job.waitForCompletion(true);

    int count = 0;
    while (ongoingMerges) {

        job = new Job(getConf(), "recollect");

        job.setJarByClass(Union.class);
        configureMapper(job, RecollectMapper.class, new Path(tmp + "explode_" + count),
                SequenceFileInputFormat.class);

        configureReducer(job, RecollectReducer.class, new Path(tmp + "recollect_" + count),
                SequenceFileOutputFormat.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);

        job = new Job(getConf(), "explode");
        job.setJarByClass(Union.class);
        configureMapper(job, ExplodeMapper.class, new Path(tmp + "recollect_" + count),
                SequenceFileInputFormat.class);
        ++count;
        configureReducer(job, ExplodeReducer.class, new Path(tmp + "explode_" + count),
                SequenceFileOutputFormat.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);

        ongoingMerges = job.getCounters().getGroup(UNION_FIND).findCounter(OPEN_CLASSES).getValue() != 0;
        LOG.info("ongoingMerges=" + ongoingMerges);
    }

    job = new Job(HBaseConfiguration.create(getConf()), "collect result");
    job.setSpeculativeExecution(false);
    job.setJarByClass(Union.class);
    final Path path = new Path(tmp + "recollect_*");
    FileInputFormat.addInputPath(job, path);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(ResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.waitForCompletion(true);

    return 1;
}