List of usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
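Before the project examples, here is a minimal, self-contained sketch of the call pattern (not taken from any of the projects listed below; the class name CounterUsageSketch and the MyCounters enum are illustrative). As in every example on this page, getCounters() is read only after the job has completed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CounterUsageSketch {

  // Hypothetical application-defined counter group; real jobs increment it from a task.
  enum MyCounters { SKIPPED_RECORDS }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "counter usage sketch");
    job.setJarByClass(CounterUsageSketch.class);
    job.setMapperClass(Mapper.class); // identity mapper keeps the sketch minimal
    job.setNumReduceTasks(0);         // map-only job
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean ok = job.waitForCompletion(true);
    if (ok) {
      // Counters are aggregated across all tasks once the job has finished.
      Counters counters = job.getCounters();
      long mapInputRecords = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
      long skipped = counters.findCounter(MyCounters.SKIPPED_RECORDS).getValue();
      System.out.println("map input records: " + mapInputRecords);
      System.out.println("skipped records:   " + skipped);
    }
    System.exit(ok ? 0 : 1);
  }
}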
From source file:net.broomie.WordCoCounter.java
License:Apache License
/**
 * Creates the dfdb ("document frequency" database) with MapReduce.
 * @param conf Specify the conf object, which is the Hadoop Configuration.
 * @param dfdb Specify the dfdb directory path on HDFS.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for input file IO.
 * @throws InterruptedException Exception from waitForCompletion().
 * @throws ClassNotFoundException Exception for the Mapper and Reducer classes.
 * @throws URISyntaxException Exception for new URI().
 */
private boolean runWordCount(Configuration conf, String dfdb)
    throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
  String reducerNum = conf.get(WORD_CO_COUNTER_REDUCER_NUM);
  Job job = new Job(conf);
  job.setJarByClass(WordCoCounter.class);
  TextInputFormat.addInputPath(job, new Path(in));
  FileSystem fs = FileSystem.get(new URI(dfdb), conf);
  FileStatus[] status = fs.listStatus(new Path(dfdb));
  if (status != null) {
    fs.delete(new Path(dfdb), true);
  }
  fs.close();
  FileOutputFormat.setOutputPath(job, new Path(dfdb));
  // job.setMapperClass(TokenizeMapper.class);
  job.setMapperClass(DFMapper.class);
  job.setReducerClass(TokenizeReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  // job.setNumReduceTasks(Integer.valueOf(reducerNum));
  job.setNumReduceTasks(Integer.valueOf(8));
  boolean rv = job.waitForCompletion(true);
  if (rv) {
    Counters counters = job.getCounters();
    long inputNum = counters
        .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
    FileSystem hdfs = FileSystem.get(conf);
    String numLinePath = conf.get(PROP_LINE_NUM);
    FSDataOutputStream stream = hdfs.create(new Path(numLinePath));
    stream.writeUTF(String.valueOf((int) inputNum));
    stream.close();
  }
  return rv;
}
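Note: this example (and most of the examples below) resolves the built-in map-input-record counter through the string group name "org.apache.hadoop.mapred.Task$Counter". In Hadoop 2.x and later the same counter is also exposed through the org.apache.hadoop.mapreduce.TaskCounter enum; a small equivalent sketch, assuming a Counters object named counters as in the example above:

// Equivalent lookups for the map-input-record count: the string-based form mirrors
// the examples on this page; the enum-based form is the org.apache.hadoop.mapreduce API.
long byGroupName = counters
    .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
long byEnum = counters
    .findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_INPUT_RECORDS).getValue();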
From source file:net.java.jatextmining.JaCoOccurrence.java
License:Apache License
/**
 * Writes the number of lines of the input file to HDFS.
 * @param conf Specify the Hadoop Configuration object.
 * @param job Specify the Hadoop Job object.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for IO.
 */
private boolean writeDocNumFile(Configuration conf, Job job) throws IOException {
  Counters counters = job.getCounters();
  inputNum = (int) counters
      .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
  if (inputNum == 0) {
    return false;
  }
  FileSystem hdfs = FileSystem.get(conf);
  String docNumPath = conf.get("jatextmining.docNumPath");
  if (docNumPath == null) {
    return false;
  }
  FSDataOutputStream stream = hdfs.create(new Path(docNumPath));
  stream.writeUTF(String.valueOf((int) inputNum));
  stream.close();
  return true;
}
From source file:net.java.jatextmining.JaWordCounter.java
License:Apache License
/**
 * Creates the DF database from Japanese documents.
 * @param conf Specify the Hadoop Configuration object.
 * @param dfdb Specify the saving path for the DF database.
 * @return Returns true on success, false on failure.
 * @throws IOException Exception for IO.
 * @throws URISyntaxException Exception for the DF database URI.
 * @throws InterruptedException Exception from waitForCompletion().
 * @throws ClassNotFoundException Exception from waitForCompletion().
 */
private boolean runCreateDFDB(Configuration conf, String dfdb)
    throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
  String reducerNum = conf.get("jatextmining.JaWordCounterReducerNum");
  Job job = new Job(conf);
  job.setJarByClass(JaWordCounter.class);
  TextInputFormat.addInputPath(job, new Path(dfIn));
  FileOutputFormat.setOutputPath(job, new Path(dfdb));
  FileSystem fs = FileSystem.get(new URI(dfdb), conf);
  FileStatus[] status = fs.listStatus(new Path(dfdb));
  if (status != null) {
    fs.delete(new Path(dfdb), true);
  }
  fs.close();
  job.setMapperClass(CountMapper.class);
  job.setReducerClass(CountReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);
  job.setNumReduceTasks(Integer.valueOf(reducerNum));
  boolean rv = job.waitForCompletion(true);
  if (rv) {
    Counters counters = job.getCounters();
    long docNum = counters
        .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
    FileSystem hdfs = FileSystem.get(conf);
    String docNumPath = conf.get("jatextmining.docNum");
    FSDataOutputStream stream = hdfs.create(new Path(docNumPath));
    stream.writeUTF(String.valueOf((int) docNum));
    stream.close();
  }
  return rv;
}
From source file:nl.cwi.hadoop.kba.stat.ToyKbaDocExtractor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-c".equals(args[i])) { corpus_id = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Toy KBA system");
  job.setJarByClass(ToyKbaDocExtractor.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StreamItemWritable.class);

  // job.setCombinerClass(MyReducer.class);
  // job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(0);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(StreamItemWritable.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  /*
  for (String g : job.getCounters().getGroupNames()) {
    Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters().getGroup(g).iterator();
    LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
    while (it.hasNext()) {
      org.apache.hadoop.mapreduce.Counter c = it.next();
      LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
    }
  }
  */

  // add some more statistics
  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString());
  }
  System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
  return status;
}
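The KBA examples on this page all repeat the same pattern: after waitForCompletion(), a handful of built-in task counters (CPU time, map output/input records, reduce input groups) is copied into the run_info map. A condensed sketch of that pattern using the enum-based counter names (variable names follow the examples; this is illustrative, not project code):

Counters c = job.getCounters();
Map<String, Object> run_info = new HashMap<String, Object>();
run_info.put("elapsed_time_secs",
    c.findCounter(TaskCounter.CPU_MILLISECONDS).getValue() / 1000d);
run_info.put("num_filter_results",
    c.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getValue());
run_info.put("num_entity_doc_compares",
    c.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue());
run_info.put("num_stream_hours",
    c.findCounter(TaskCounter.REDUCE_INPUT_GROUPS).getValue());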
From source file:nl.cwi.hadoop.kba.stat.ToyKbaSystem.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-c".equals(args[i])) { corpus_id = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Toy KBA system");
  job.setJarByClass(ToyKbaSystem.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StringLongPair.class);

  // job.setCombinerClass(MyReducer.class);
  job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(1);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  /*
  for (String g : job.getCounters().getGroupNames()) {
    Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters().getGroup(g).iterator();
    LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
    while (it.hasNext()) {
      org.apache.hadoop.mapreduce.Counter c = it.next();
      LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
    }
  }
  */

  // add some more statistics
  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString());
  }
  System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
  return status;
}
From source file:nl.cwi.kba.apps.EntitySurfaceForms.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  String gcldFile = null;
  String labelsFile = null;
  String pprFile = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-l".equals(args[i])) { labelsFile = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-p".equals(args[i])) { pprFile = args[++i]; }
      else if ("-g".equals(args[i])) { gcldFile = args[++i]; }
      else if ("-c".equals(args[i])) { corpus_id = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
  conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
  conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Toy KBA system");
  job.setJarByClass(EntitySurfaceForms.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  // job.setCombinerClass(MyReducer.class);
  // job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(1);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString());
  }
  System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
  return status;
}
From source file:nl.cwi.kba.apps.FeatureExtractor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String contextFile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  String annoFile = null;
  String gcldFile = null;
  String labelsFile = null;
  String pprFile = null;
  String myverFile = null;
  String wikiFile = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-l".equals(args[i])) { labelsFile = args[++i]; }
      else if ("-a".equals(args[i])) { annoFile = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-p".equals(args[i])) { pprFile = args[++i]; }
      else if ("-g".equals(args[i])) { gcldFile = args[++i]; }
      else if ("-s".equals(args[i])) { myverFile = args[++i]; }
      else if ("-c".equals(args[i])) { contextFile = args[++i]; }
      else if ("-w".equals(args[i])) { wikiFile = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Map<String, String> Attr = new LinkedHashMap<String, String>();
  // Attr.put("trec-kba", "");
  /*
  Attr.put("LengthTitle", "");
  Attr.put("LengthBody", "");
  Attr.put("LengthAnchor", "");
  Attr.put("Source", "");
  Attr.put("English", "");
  Attr.put("MentionsTitle", "");
  Attr.put("MentionsBody", "");
  Attr.put("MentionsAnchor", "");
  Attr.put("FirstPos", "");
  Attr.put("LastPos", "");
  Attr.put("Spread", "");
  Attr.put("FirstPosNorm", "");
  Attr.put("LastPosNorm", "");
  Attr.put("SpreadNorm", "");
  // Attr.put("Related", "");
  Attr.put("Relatedtitle", "");
  Attr.put("RelatedBody", "");
  Attr.put("RelatedAnchor", "");
  Attr.put("ppr", "");
  Attr.put("gcld", "");
  Attr.put("partial", "");
  Attr.put("s_form", "");
  Attr.put("contxL", "0");
  Attr.put("contxR", "0");
  Attr.put("cos", "0");
  Attr.put("kl", "0");
  Attr.put("jac", "0");
  Attr.put("Class", "");
  */
  Attr.put("gcld", "0");
  Attr.put("jac", "0");
  Attr.put("cos", "0");
  Attr.put("kl", "0");
  Attr.put("ppr", "0");
  Attr.put("s_form", "0");
  Attr.put("contxR", "0");
  Attr.put("contxL", "0");
  Attr.put("FirstPos", "0");
  Attr.put("LastPos", "0");
  Attr.put("LengthBody", "0");
  Attr.put("FirstPosNorm", "0");
  Attr.put("MentionsBody", "0");
  Attr.put("RelatedBody", "0");
  Attr.put("Spread", "0");
  Attr.put("LastPosNorm", "0");
  Attr.put("SpreadNorm", "0");
  Attr.put("LengthAnchor", "0");
  Attr.put("Source", "0");
  Attr.put("LengthTitle", "0");
  Attr.put("partial", "0");
  Attr.put("MentionsAnchor", "0");
  Attr.put("Relatedtitle", "0");
  Attr.put("English", "0");
  Attr.put("RelatedAnchor", "0");
  Attr.put("MentionsTitle", "0");
  Attr.put("Class", "0");

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
  conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
  conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
  // conf.set(MYVER, new Path(myverFile).toUri().toString());
  conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
  conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
  conf.set(WIKI_HDFS, new Path(contextFile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);
  // set time
  conf.setLong("mapred.task.timeout", 40 * 600000);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Feature Extractor");
  job.setJarByClass(FeatureExtractor.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
  // DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  // job.setCombinerClass(MyReducer.class);
  // job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(1);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  // System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  System.out.println("@RELATION" + " trec-kba" + " ");
  for (String key : Attr.keySet()) {
    if (key.equalsIgnoreCase("English")) {
      System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
    } else if (key.equalsIgnoreCase("Class")) {
      System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
    } else {
      System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
    }
  }
  System.out.println("\n@DATA");

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString().split("\t\t")[1]);
  }
  // System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
  return status;
}
From source file:nl.cwi.kba.apps.FeatureExtractor_filterer.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String contextFile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  String annoFile = null;
  String gcldFile = null;
  String labelsFile = null;
  String pprFile = null;
  String myverFile = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-l".equals(args[i])) { labelsFile = args[++i]; }
      else if ("-a".equals(args[i])) { annoFile = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Map<String, String> Attr = new LinkedHashMap<String, String>();
  // Attr.put("trec-kba", "");
  Attr.put("LengthTitle", "");
  Attr.put("LengthBody", "");
  Attr.put("LengthAnchor", "");
  Attr.put("Source", "");
  Attr.put("English", "");
  Attr.put("MentionsTitle", "");
  Attr.put("MentionsBody", "");
  Attr.put("MentionsAnchor", "");
  Attr.put("FirstPos", "");
  Attr.put("LastPos", "");
  Attr.put("Spread", "");
  Attr.put("FirstPosNorm", "");
  Attr.put("LastPosNorm", "");
  Attr.put("SpreadNorm", "");
  // Attr.put("Related", "");
  Attr.put("Relatedtitle", "");
  Attr.put("RelatedBody", "");
  Attr.put("RelatedAnchor", "");
  // Attr.put("contxL", "0");
  // Attr.put("contxR", "0");
  Attr.put("Class", "");

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
  conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);
  // set time
  conf.setLong("mapred.task.timeout", 40 * 600000);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Toy KBA system");
  job.setJarByClass(FeatureExtractor_filterer.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  // job.setCombinerClass(MyReducer.class);
  // job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(1);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  // System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  System.out.println("@RELATION" + " trec-kba" + " ");
  for (String key : Attr.keySet()) {
    if (key.equalsIgnoreCase("English")) {
      System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
    } else if (key.equalsIgnoreCase("Class")) {
      System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
    } else {
      System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
    }
  }
  System.out.println("\n@DATA");

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString().split("\t\t")[1]);
  }
  return status;
}
From source file:nl.cwi.kba.apps.KbaDocExtractor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String queryfile = null;
  String systemdescription = null;
  String corpus_id = null;
  String runtag = null;
  String teamname = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-q".equals(args[i])) { queryfile = args[++i]; }
      else if ("-r".equals(args[i])) { runtag = args[++i]; }
      else if ("-t".equals(args[i])) { teamname = args[++i]; }
      else if ("-d".equals(args[i])) { systemdescription = args[++i]; }
      else if ("-c".equals(args[i])) { corpus_id = args[++i]; }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null || queryfile == null)
    return printUsage();

  if (runtag == null) runtag = "toy_1";
  if (teamname == null) teamname = "CompInsights";
  if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only";
  if (systemdescription == null) systemdescription = "Description intentionally left blank.";

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);
  LOG.info(" - runtag: " + runtag);
  LOG.info(" - teamname: " + teamname);
  LOG.info(" - corpus_id: " + corpus_id);
  LOG.info(" - run description: " + systemdescription);

  Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id);

  Configuration conf = getConf();
  conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
  conf.set(RUNTAG, runtag);
  conf.set(TEAMNAME, teamname);

  FileSystem fs = FileSystem.get(conf);
  // Lookup required data from the topic file
  loadTopicData(queryfile, fr, fs, run_info);

  Job job = new Job(conf, "Toy KBA system");
  job.setJarByClass(KbaDocExtractor.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
      job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StreamItemWritable.class);

  // job.setCombinerClass(MyReducer.class);
  // job.setReducerClass(MyReducer.class);
  job.setNumReduceTasks(0);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(StreamItemWritable.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  /*
  for (String g : job.getCounters().getGroupNames()) {
    Iterator<org.apache.hadoop.mapreduce.Counter> it = job.getCounters().getGroup(g).iterator();
    LOG.info(g + "\t" + job.getCounters().getGroup(g).getDisplayName());
    while (it.hasNext()) {
      org.apache.hadoop.mapreduce.Counter c = it.next();
      LOG.info("\t" + c.getDisplayName() + "\t" + c.getValue());
    }
  }
  */

  // add some more statistics
  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time_secs", ((double) cputime / 1000d));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  fr.setAdditionalProperties("run_info", run_info);
  System.out.println("#" + new Filter_run.Factory().toJSON(fr));

  Text line = new Text();
  LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000")));
  for (int i = 0; i < num_filter_results; i++) {
    reader.readLine(line);
    System.out.println(line.toString());
  }
  System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));
  return status;
}
From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  String in = null;
  String out = null;
  String traintestFile = null;
  HashMap<String, Object> run_info = new HashMap<String, Object>();

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-i".equals(args[i])) { in = args[++i]; }
      else if ("-o".equals(args[i])) { out = args[++i]; }
      else if ("-tt".equals(args[i])) { traintestFile = args[++i]; Log.info("TrainTest"); }
      else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); }
      else { other_args.add(args[i]); }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }

  if (other_args.size() > 0 || in == null || out == null)
    return printUsage();

  LOG.info("Tool: " + this.getClass().getName());
  LOG.info(" - input path: " + in);
  LOG.info(" - output path: " + out);

  Configuration conf = getConf();
  conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
  // set time
  conf.setLong("mapred.task.timeout", 40 * 600000);
  conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
  conf.set("mapred.child.java.opts", "-Xmx4096m");

  Job job = new Job(conf, "chunk -stream");
  job.setJarByClass(chunk_stream_DocExtractor.class);

  // some weird issues with Thrift classes in the Hadoop distro.
  job.setUserClassesTakesPrecedence(true);

  // make the query file available to each mapper.
  DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
      job.getConfiguration());
  DistributedCache.createSymlink(job.getConfiguration());

  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(MyMapper.class);
  FileInputFormat.addInputPath(job, new Path(in));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(1);

  FileSystem.get(conf).delete(new Path(out), true);
  TextOutputFormat.setOutputPath(job, new Path(out));
  LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

  // Let's go
  int status = job.waitForCompletion(true) ? 0 : 1;

  Counters c = job.getCounters();
  long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
  run_info.put("elapsed_time", ((double) cputime));
  long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS).getValue();
  run_info.put("num_filter_results", num_filter_results);
  long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue();
  run_info.put("num_entity_doc_compares", num_entity_doc_compares);
  long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
  run_info.put("num_stream_hours", hours);

  return status;
}