Example usage for org.apache.hadoop.mapreduce Job getCounters

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job getCounters.

Prototype

public Counters getCounters() throws IOException

Source Link

Document

Gets the counters for this job.

Usage

From source file:nl.cwi.kba2013.apps.FeatureExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;/*from   w ww . j a  v  a2 s.  c om*/
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    //      Text line = new Text();
    //      LineReader reader = new LineReader(fs.open(new Path(out
    //            + "/part-r-00000")));
    //      for (int i = 0; i < num_filter_results; i++) {
    //         reader.readLine(line);
    //         System.out.println(line.toString().split("\t\t")[1]);
    //      }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.FeatureExtractor_DocExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;/*from   w w w.  ja va  2  s  .  co  m*/
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());
    /*
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
            
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
            
    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);
            
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
            
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    */

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(TextOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(StreamItemWritable.class);
    //job.setMapOutputValueClass(Text.class);
    //job.setOutputKeyClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    // Text line = new Text();
    // LineReader reader = new LineReader(fs.open(new Path(out
    // + "/part-r-00000")));
    // for (int i = 0; i < num_filter_results; i++) {
    // reader.readLine(line);
    // System.out.println(line.toString().split("\t\t")[1]);
    // }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.KBaDocExtractorFromCleansed.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;/*from  w ww  .  j av  a  2 s  . c o m*/
    String out = null;
    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    Configuration conf = getConf();
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Annotation Extraction");
    job.setJarByClass(KBaDocExtractorFromCleansed.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);

    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamItemWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();

    return status;

}

From source file:nl.cwi.kba2013.apps.KbaNameVariantMatchFeatureExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;/* w w w  .  ja va2 s.c  o m*/
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";
    if (systemdescription_short == null)
        systemdescription_short = "a two -step classification approach";

    //      LOG.info("Tool: " + this.getClass().getName());
    //      LOG.info(" - input path: " + in);
    //      LOG.info(" - output path: " + out);
    //      LOG.info(" - runtag: " + runtag);
    //      LOG.info(" - teamname: " + teamname);
    //      LOG.info(" - corpus_id: " + corpus_id);
    //      LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
     * Attr.put("LengthTitle", ""); Attr.put("LengthBody", "");
     * Attr.put("LengthAnchor", ""); Attr.put("Source", "");
     * Attr.put("English", ""); Attr.put("MentionsTitle", "");
     * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", "");
     * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread",
     * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", "");
     * Attr.put("SpreadNorm", ""); // Attr.put("Related", "");
     * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", "");
     * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld",
     * ""); Attr.put("partial", ""); Attr.put("s_form", "");
     * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos",
     * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class",
     * "");
     */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KbaNameVariantMatchFeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());
    /*
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
            
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
            
    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);
            
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
            
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    */

    //job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(TextOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(StreamItemWritable.class);
    //job.setMapOutputValueClass(Text.class);
    //job.setOutputKeyClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    // Text line = new Text();
    // LineReader reader = new LineReader(fs.open(new Path(out
    // + "/part-r-00000")));
    // for (int i = 0; i < num_filter_results; i++) {
    // reader.readLine(line);
    // System.out.println(line.toString().split("\t\t")[1]);
    // }

    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.ReadGzip.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;//from  ww w.j  a v a 2  s  .c o m
    String out = null;

    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(ReadGzip.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    return status;

}

From source file:nl.cwi.wikilink.apps.WikiLinkContextExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;/*from  ww  w.ja va2 s . c  o m*/
    String out = null;
    String queryFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - query file: " + queryFile);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryFile).toUri().toString());

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    // loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "WikiLinks");
    job.setJarByClass(WikiLinkContextExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(queryFile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    // add some more statistics
    Counters c = job.getCounters();

    return status;

}

From source file:nl.cwi.wikilink.apps.WikiLinkContextToy.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;//  w  w w  .  j  ava2  s .  co m
    String out = null;
    String queryFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - query file: " + queryFile);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryFile).toUri().toString());

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    // loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "WikiLinks");
    job.setJarByClass(WikiLinkContextToy.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(queryFile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    // add some more statistics
    Counters c = job.getCounters();

    return status;

}

From source file:org.acacia.csr.java.LineCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*/*from   w w w. j  a  va  2s.  c  om*/
      String dir1 = "/user/miyuru/wcout";
      String dir2 = "/user/miyuru/lcout";
       //We first delete the temporary directories if they exist on the HDFS
        FileSystem fs1 = FileSystem.get(new JobConf());
                
       if(fs1.exists(new Path(dir2))){
          fs1.delete(new Path(dir2), true);
       }
            
        JobConf conf = new JobConf(LineCount.class);
        conf.setJobName("LineCount");
               
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(IntWritable.class);
               
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
               
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
               
        FileInputFormat.setInputPaths(conf, new Path(dir1));
        FileOutputFormat.setOutputPath(conf, new Path(dir2));
               
        Job job = new Job(conf, "line count");
        job.waitForCompletion(true); 
        org.apache.hadoop.mapreduce.Counters cntr = job.getCounters();
        System .out.println("Number of lines in the file" + cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue());
        */

    long edgeCount = 0;
    //String dir3 = "/user/miyuru/wcout";
    String dir4 = "/user/miyuru/lcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs2 = FileSystem.get(new JobConf());

    if (fs2.exists(new Path(dir4))) {
        fs2.delete(new Path(dir4), true);
    }

    JobConf conf1 = new JobConf(LineCount.class);
    conf1.setJobName("LineCount");

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(IntWritable.class);

    conf1.setMapperClass(Map.class);
    conf1.setCombinerClass(Reduce.class);
    conf1.setReducerClass(Reduce.class);

    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf1, new Path(dir5));
    FileOutputFormat.setOutputPath(conf1, new Path(dir4));

    Job job1 = new Job(conf1, "line count");
    job1.setNumReduceTasks(0);
    job1.waitForCompletion(true);
    org.apache.hadoop.mapreduce.Counters cntr = job1.getCounters();
    edgeCount = cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();

    File efile = new File("/tmp/efile");

    if (efile.exists()) {
        efile.delete();
    }

    PrintWriter writer = new PrintWriter("/tmp/efile", "UTF-8");
    writer.println(edgeCount);
    writer.flush();
    writer.close();

    //edgeCount = edgeCount -1;//This is to remove the line number additionlly added to each edgelist file by HDFS. This is strange, but it happens.
    System.out.println("======>Edge count is : " + edgeCount);
    System.out.println("------Done Line Count---------------");
}

From source file:org.ankus.mapreduce.algorithms.statistics.nominalstats.NominalStatsDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /**//w ww . j a  v  a  2s  .  co  m
     * 1st Job - Frequency Computation (MR)
     * 2nd Job - Ratio Computation (By Total Record Count, Map Only)
     */
    logger.info("Nominal Statistics MR-Job is Started..");

    Configuration conf = new Configuration();
    if (!ConfigurationVariable.setFromArguments(args, conf)) {
        logger.error("MR Job Setting Failed..");
        Usage.printUsage(Constants.ALGORITHM_NOMINAL_STATS);
        logger.info("Error: MR Job Setting Failed..: Configuration Failed");
        return 1;
    }

    String tempStr = "_freqs";

    logger.info("1st-Step of MR-Job is Started..");

    Job job1 = new Job();
    set2StepJob1(job1, conf, tempStr);
    job1.setJarByClass(NominalStatsDriver.class);

    job1.setMapperClass(NominalStatsFrequencyMapper.class);
    job1.setReducerClass(NominalStatsFrequencyReducer.class);

    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(IntWritable.class);

    job1.setOutputKeyClass(NullWritable.class);
    job1.setOutputValueClass(Text.class);

    if (!job1.waitForCompletion(true)) {
        logger.error("Error: MR(Step-1) for Nominal Stats is not Completion");
        logger.info("MR-Job is Failed..");
        return 1;
    }

    logger.info("1st-Step of MR-Job is successfully finished..");
    logger.info("2nd-Step of MR-Job is Started..");

    long mapOutCnt = job1.getCounters()
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_OUTPUT_RECORDS").getValue();

    Job job2 = new Job();
    set2StepJob2(job2, conf, tempStr, mapOutCnt);
    job2.setJarByClass(NominalStatsDriver.class);

    job2.setMapperClass(NominalStatsRatioMapper.class);

    job2.setMapOutputKeyClass(NullWritable.class);
    job2.setMapOutputValueClass(Text.class);

    job2.setNumReduceTasks(0);

    if (!job2.waitForCompletion(true)) {
        logger.error("Error: MR(Step-2) for Nominal Stats is not Completeion");
        logger.info("MR-Job is Failed..");
        return 1;
    }

    // temp deletetion
    if (conf.get(ArgumentsConstants.TEMP_DELETE, "true").equals("true")) {
        logger.info("Temporary Files are Deleted..: " + conf.get(ArgumentsConstants.OUTPUT_PATH) + tempStr);
        FileSystem.get(conf).delete(new Path(conf.get(ArgumentsConstants.OUTPUT_PATH) + tempStr), true);
    }
    logger.info("MR-Job is successfully finished..");
    return 0;
}

From source file:org.apache.bigtop.itest.hbase.system.TestLoadAndVerify.java

License:Apache License

private void doVerify(Configuration conf, HTableDescriptor htd) throws Exception {
    Path outputDir = new Path(HBaseTestUtil.getMROutputDir(TEST_NAME), "verify-output");

    Job job = new Job(conf);
    job.setJarByClass(this.getClass());
    job.setJobName(TEST_NAME + " Verification for " + htd.getNameAsString());

    Scan scan = new Scan();

    TableMapReduceUtil.initTableMapperJob(htd.getNameAsString(), scan, VerifyMapper.class, BytesWritable.class,
            BytesWritable.class, job);
    int scannerCaching = conf.getInt("verify.scannercaching", SCANNER_CACHING);
    TableMapReduceUtil.setScannerCaching(job, SCANNER_CACHING);

    job.setReducerClass(VerifyReducer.class);
    job.setNumReduceTasks(NUM_REDUCE_TASKS);
    FileOutputFormat.setOutputPath(job, outputDir);
    assertTrue(job.waitForCompletion(true));

    long numOutputRecords = job.getCounters().findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue();
    assertEquals(0, numOutputRecords);// w ww  .  j  a va 2  s  .c o m
}