Example usage for org.apache.hadoop.mapreduce Job getCounters

List of usage examples for org.apache.hadoop.mapreduce Job getCounters

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job getCounters.

Prototype

public Counters getCounters() throws IOException 

Document

Gets the counters for this job.
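
Job#getCounters() returns a snapshot of all counters for the job, including the built-in task and file-system counters and any counters defined by user code, and is normally read after waitForCompletion() has returned. Below is a minimal, self-contained sketch (the class name and the enum-based counter lookup are illustrative and not taken from the examples that follow); it runs an identity map/reduce over text input and prints the map-input-record count:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CounterExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The default (identity) mapper and reducer are enough to demonstrate getCounters().
        Job job = Job.getInstance(conf, "counter example");
        job.setJarByClass(CounterExample.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean ok = job.waitForCompletion(true);

        // getCounters() returns the aggregated counters for the (now finished) job.
        Counters counters = job.getCounters();
        long mapInputRecords = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
        System.out.println("map input records: " + mapInputRecords);

        System.exit(ok ? 0 : 1);
    }
}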

Usage

From source file:net.broomie.WordCoCounter.java

License:Apache License

/**
 * Creates the dfdb (document-frequency database) with MapReduce.
 * @param conf Specify the Hadoop Configuration object.
 * @param dfdb Specify the dfdb directory path on HDFS.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for input file I/O.
 * @throws InterruptedException Exception from waitForCompletion().
 * @throws ClassNotFoundException Exception for the Mapper and Reducer classes.
 * @throws URISyntaxException Exception from new URI().
 */
private boolean runWordCount(Configuration conf, String dfdb)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    String reducerNum = conf.get(WORD_CO_COUNTER_REDUCER_NUM);
    Job job = new Job(conf);
    job.setJarByClass(WordCoCounter.class);
    TextInputFormat.addInputPath(job, new Path(in));
    FileSystem fs = FileSystem.get(new URI(dfdb), conf);
    FileStatus[] status = fs.listStatus(new Path(dfdb));
    if (status != null) {
        fs.delete(new Path(dfdb), true);
    }
    fs.close();
    FileOutputFormat.setOutputPath(job, new Path(dfdb));
    //job.setMapperClass(TokenizeMapper.class);
    job.setMapperClass(DFMapper.class);
    job.setReducerClass(TokenizeReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(Integer.valueOf(reducerNum));
    job.setNumReduceTasks(Integer.valueOf(8));
    boolean rv = job.waitForCompletion(true);
    if (rv) {
        Counters counters = job.getCounters();
        long inputNum = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();
        FileSystem hdfs = FileSystem.get(conf);
        String numLinePath = conf.get(PROP_LINE_NUM);
        FSDataOutputStream stream = hdfs.create(new Path(numLinePath));
        stream.writeUTF(String.valueOf((int) inputNum));
        stream.close();
    }
    return rv;
}
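
The group name "org.apache.hadoop.mapred.Task$Counter" used above is the legacy string name for the built-in task counters. On current Hadoop releases the same value can also be read through the TaskCounter enum; a short sketch, assuming a finished Job named job:

    Counters counters = job.getCounters();

    // Legacy, string-based lookup as in the snippet above (old group name kept for compatibility):
    long byName = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
            .getValue();

    // Enum-based lookup:
    long byEnum = counters
            .findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_INPUT_RECORDS)
            .getValue();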

From source file:net.java.jatextmining.JaCoOccurrence.java

License:Apache License

/**
 * Writes the number of lines of the input file to HDFS.
 * @param conf Specify the Hadoop Configuration object.
 * @param job Specify the Hadoop Job object.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for I/O.
 */
private boolean writeDocNumFile(Configuration conf, Job job) throws IOException {
    Counters counters = job.getCounters();
    inputNum = (int) counters.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
            .getValue();
    if (inputNum == 0) {
        return false;
    }
    FileSystem hdfs = FileSystem.get(conf);
    String docNumPath = conf.get("jatextmining.docNumPath");
    if (docNumPath == null) {
        return false;
    }
    FSDataOutputStream stream = hdfs.create(new Path(docNumPath));
    stream.writeUTF(String.valueOf((int) inputNum));
    stream.close();

    return true;
}

From source file:net.java.jatextmining.JaWordCounter.java

License:Apache License

/**
 * Creates the DF (document frequency) database from Japanese documents.
 * @param conf Specify the Hadoop Configuration object.
 * @param dfdb Specify the saving path for the DF database.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for I/O.
 * @throws URISyntaxException Exception for the DF database URI.
 * @throws InterruptedException Exception from waitForCompletion().
 * @throws ClassNotFoundException Exception from waitForCompletion().
 */
private boolean runCreateDFDB(Configuration conf, String dfdb)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    String reducerNum = conf.get("jatextmining.JaWordCounterReducerNum");
    Job job = new Job(conf);
    job.setJarByClass(JaWordCounter.class);
    TextInputFormat.addInputPath(job, new Path(dfIn));
    FileOutputFormat.setOutputPath(job, new Path(dfdb));
    FileSystem fs = FileSystem.get(new URI(dfdb), conf);
    FileStatus[] status = fs.listStatus(new Path(dfdb));
    if (status != null) {
        fs.delete(new Path(dfdb), true);
    }
    fs.close();
    job.setMapperClass(CountMapper.class);
    job.setReducerClass(CountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setNumReduceTasks(Integer.valueOf(reducerNum));
    boolean rv = job.waitForCompletion(true);
    if (rv) {
        Counters counters = job.getCounters();
        long docNum = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();
        FileSystem hdfs = FileSystem.get(conf);
        String docNumPath = conf.get("jatextmining.docNum");
        FSDataOutputStream stream = hdfs.create(new Path(docNumPath));
        stream.writeUTF(String.valueOf((int) docNum));
        stream.close();
    }

    return rv;
}

From source file:nl.cwi.hadoop.kba.stat.ToyKbaDocExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-c".equals(args[i])) {
                corpus_id = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(ToyKbaDocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamItemWritable.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    /*
    for (String g : job.getCounters().getGroupNames()) {
            
      Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters()
          .getGroup(g).iterator();
            
      LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
            
      while (it.hasNext()) {
        org.apache.hadoop.mapreduce.Counter c = it.next();
        LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
      }
    }
    */

    // add some more statistics
    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Filter_run.Factory().toJSON(fr));

    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString());
    }

    System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.hadoop.kba.stat.ToyKbaSystem.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-c".equals(args[i])) {
                corpus_id = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(ToyKbaSystem.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringLongPair.class);

    // job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    /*
    for (String g : job.getCounters().getGroupNames()) {
            
      Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters()
          .getGroup(g).iterator();
            
      LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
            
      while (it.hasNext()) {
        org.apache.hadoop.mapreduce.Counter c = it.next();
        LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
      }
    }
    */

    // add some more statistics
    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Filter_run.Factory().toJSON(fr));

    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString());
    }

    System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba.apps.EntitySurfaceForms.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;

    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];
            } else if ("-c".equals(args[i])) {
                corpus_id = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(EntitySurfaceForms.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Filter_run.Factory().toJSON(fr));

    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString());
    }

    System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba.apps.FeatureExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];

            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];

            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    /*
    Attr.put("LengthTitle", "");
    Attr.put("LengthBody", "");
    Attr.put("LengthAnchor", "");
    Attr.put("Source", "");
    Attr.put("English", "");
    Attr.put("MentionsTitle", "");
    Attr.put("MentionsBody", "");
    Attr.put("MentionsAnchor", "");
    Attr.put("FirstPos", "");
    Attr.put("LastPos", "");
    Attr.put("Spread", "");
    Attr.put("FirstPosNorm", "");
    Attr.put("LastPosNorm", "");
    Attr.put("SpreadNorm", "");
    // Attr.put("Related", "");
    Attr.put("Relatedtitle", "");
    Attr.put("RelatedBody", "");
    Attr.put("RelatedAnchor", "");
    Attr.put("ppr", "");
    Attr.put("gcld", "");
    Attr.put("partial", "");
    Attr.put("s_form", "");
    Attr.put("contxL", "0");
    Attr.put("contxR", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("jac", "0");
    Attr.put("Class", "");
    */
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    //conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(contextFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());

    // DistributedCache.addCacheFile( new URI(new Path(myverFile) + "#" +
    //MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    // System.out.println("#" + new Filter_run.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString().split("\t\t")[1]);
    }
    /*
     * System.out.println("#" + new
     * Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
     */

    return status;

}

From source file:nl.cwi.kba.apps.FeatureExtractor_filterer.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];

            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Map<String, String> Attr = new LinkedHashMap<String, String>();
    // Attr.put("trec-kba", "");
    Attr.put("LengthTitle", "");
    Attr.put("LengthBody", "");
    Attr.put("LengthAnchor", "");
    Attr.put("Source", "");
    Attr.put("English", "");
    Attr.put("MentionsTitle", "");
    Attr.put("MentionsBody", "");
    Attr.put("MentionsAnchor", "");
    Attr.put("FirstPos", "");
    Attr.put("LastPos", "");
    Attr.put("Spread", "");
    Attr.put("FirstPosNorm", "");
    Attr.put("LastPosNorm", "");
    Attr.put("SpreadNorm", "");
    // Attr.put("Related", "");
    Attr.put("Relatedtitle", "");
    Attr.put("RelatedBody", "");
    Attr.put("RelatedAnchor", "");

    //Attr.put("contxL", "0");
    //Attr.put("contxR", "0");
    Attr.put("Class", "");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());

    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    //set time
    conf.setLong("mapred.task.timeout", 40 * 600000);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(FeatureExtractor_filterer.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    // System.out.println("#" + new Filter_run.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }

    }
    System.out.println("\n@DATA");
    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString().split("\t\t")[1]);
    }

    return status;

}

From source file:nl.cwi.kba.apps.KbaDocExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String systemdescription = null;
    String corpus_id = null;
    String runtag = null;
    String teamname = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-c".equals(args[i])) {
                corpus_id = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || queryfile == null)
        return printUsage();

    if (runtag == null)
        runtag = "toy_1";

    if (teamname == null)
        teamname = "CompInsights";

    if (corpus_id == null)
        corpus_id = "kba-stream-corpus-2012-cleansed-only";

    if (systemdescription == null)
        systemdescription = "Description intentionally left blank.";

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Toy KBA system");
    job.setJarByClass(KbaDocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StreamItemWritable.class);

    // job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamItemWritable.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    /*
    for (String g : job.getCounters().getGroupNames()) {
            
      Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters()
          .getGroup(g).iterator();
            
      LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
            
      while (it.hasNext()) {
        org.apache.hadoop.mapreduce.Counter c = it.next();
        LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
      }
    }
    */

    // add some more statistics
    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time_secs", ((double) cputime / 1000d));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    System.out.println("#" + new Filter_run.Factory().toJSON(fr));

    Text line = new Text();
    LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
    for (int i = 0; i < num_filter_results; i++) {
        reader.readLine(line);
        System.out.println(line.toString());
    }

    System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
                Log.info("TrainTest");
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "chunk -stream");
    job.setJarByClass(chunk_stream_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);

    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    return status;

}