List of usage examples for org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat#setOutputFormatClass
@SuppressWarnings("unchecked") public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)
From source file:io.dataapps.chlorine.hadoop.HDFSScanMR.java
License:Apache License
/**
 * Builds the map-only "Chlorine_HDFS_Scan" job that scans text files under {@code in}.
 *
 * <p>The scan parameters are written into {@code conf} before the job is created.
 * The optional finder configuration file is shipped through the job cache and its
 * base name is published under the {@code finder_file} key.
 *
 * @param conf base configuration; it is modified in place
 * @param in input directory, scanned recursively
 * @param out output directory of the job
 * @param matchPath value stored under the {@code matchPath} key
 * @param scanSince value stored under the {@code scanSince} key
 * @param chlorineConfigFilePath URI of an extra finder configuration file, or {@code null}
 * @param queue job queue name, or {@code null} to keep the configured queue
 * @param maskPath value stored under the {@code maskPath} key
 * @return the configured, not yet submitted job
 * @throws IOException if the job cannot be created or the paths cannot be registered
 */
public static Job makeJob(Configuration conf, Path in, Path out, String matchPath, long scanSince,
        String chlorineConfigFilePath, String queue, String maskPath) throws IOException {
    conf.setBoolean("mapred.output.compress", false);
    conf.setLong("scanSince", scanSince);
    conf.set("matchPath", matchPath);
    conf.set("maskPath", maskPath);
    conf.set("inputPath", in.toString());
    if (queue != null) {
        conf.set("mapred.job.queue.name", queue);
    }
    conf.set("fs.permissions.umask-mode", "007");
    conf.setInt("input_path_depth", in.depth());

    Job job = Job.getInstance(conf, "Chlorine_HDFS_Scan");
    job.setJarByClass(HDFSScanMR.class);

    if (chlorineConfigFilePath != null) {
        try {
            job.addCacheFile(new URI(chlorineConfigFilePath));
            String finderFileName = new File(chlorineConfigFilePath).getName();
            // Job.getInstance() works on its own copy of conf, so a value set only on
            // conf from here on never reaches the tasks. Set it on the job's
            // configuration, and keep setting it on conf for callers that read it back.
            job.getConfiguration().set("finder_file", finderFileName);
            conf.set("finder_file", finderFileName);
        } catch (URISyntaxException e) {
            // Best effort: the scan still runs without the extra finder file.
            LOG.error(e);
        }
    }

    job.setMapperClass(DeepScanMapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, in);
    TextInputFormat.setInputDirRecursive(job, true);
    TextInputFormat.setInputPathFilter(job, NewFilesFilter.class);

    FileOutputFormat.setOutputPath(job, out);
    // Lazy wrapper: the underlying output format is set up only on first write.
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    return job;
}
From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java
License:Apache License
public int run(String[] args) throws Exception { CommandLineParser cli = new CommandLineParser(); if (args.length == 0) { cli.printUsage();//from www.j a v a2 s.c om return 1; } try { Job job = new Job(new Configuration()); job.setJobName(getClass().getName()); Configuration conf = job.getConfiguration(); CommandLine results = cli.parse(conf, args); if (results.hasOption("input")) { Path path = new Path(results.getOptionValue("input")); FileInputFormat.setInputPaths(job, path); } if (results.hasOption("output")) { Path path = new Path(results.getOptionValue("output")); FileOutputFormat.setOutputPath(job, path); } if (results.hasOption("jar")) { job.setJar(results.getOptionValue("jar")); } if (results.hasOption("inputformat")) { explicitInputFormat = true; setIsJavaRecordReader(conf, true); job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class)); } if (results.hasOption("javareader")) { setIsJavaRecordReader(conf, true); } if (results.hasOption("map")) { setIsJavaMapper(conf, true); job.setMapperClass(getClass(results, "map", conf, Mapper.class)); } if (results.hasOption("partitioner")) { job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class)); } if (results.hasOption("reduce")) { setIsJavaReducer(conf, true); job.setReducerClass(getClass(results, "reduce", conf, Reducer.class)); } if (results.hasOption("reduces")) { job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces"))); } if (results.hasOption("writer")) { explicitOutputFormat = true; setIsJavaRecordWriter(conf, true); job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class)); } if (results.hasOption("lazyOutput")) { if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) { LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass()); } } if (results.hasOption("avroInput")) { avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase()); } if (results.hasOption("avroOutput")) { 
avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase()); } if (results.hasOption("program")) { setExecutable(conf, results.getOptionValue("program")); } // if they gave us a jar file, include it into the class path String jarFile = job.getJar(); if (jarFile != null) { final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() }; // FindBugs complains that creating a URLClassLoader should be // in a doPrivileged() block. ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() { public ClassLoader run() { return new URLClassLoader(urls); } }); conf.setClassLoader(loader); } setupPipesJob(job); return job.waitForCompletion(true) ? 0 : 1; } catch (ParseException pe) { LOG.info("Error : " + pe); cli.printUsage(); return 1; } }
From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// ww w . j a v a 2s.c o m String out = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; Log.info("TrainTest"); } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "chunk -stream"); job.setJarByClass(chunk_stream_DocExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); return status; }
From source file:nl.cwi.kba2013.apps.FeatureExtractor_DocExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/*from w w w . ja v a 2 s. com*/ String out = null; String queryfile = null; String contextFile = null; String systemdescription = null; String systemdescription_short = null; String corpus_id = null; String runtag = null; String teamname = null; String annoFile = null; String gcldFile = null; String labelsFile = null; String pprFile = null; String myverFile = null; String wikiFile = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-a".equals(args[i])) { annoFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-ds".equals(args[i])) { systemdescription_short = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-s".equals(args[i])) { myverFile = args[++i]; } else if ("-c".equals(args[i])) { contextFile = args[++i]; } else if ("-w".equals(args[i])) { wikiFile = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if 
(corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == null) systemdescription = "Description intentionally left blank."; if (systemdescription_short == null) systemdescription_short = "a two -step classification approach"; LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); LOG.info(" - runtag: " + runtag); LOG.info(" - teamname: " + teamname); LOG.info(" - corpus_id: " + corpus_id); LOG.info(" - run description: " + systemdescription); Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short, corpus_id); Map<String, String> Attr = new LinkedHashMap<String, String>(); // Attr.put("trec-kba", ""); /* * Attr.put("LengthTitle", ""); Attr.put("LengthBody", ""); * Attr.put("LengthAnchor", ""); Attr.put("Source", ""); * Attr.put("English", ""); Attr.put("MentionsTitle", ""); * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", ""); * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread", * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", ""); * Attr.put("SpreadNorm", ""); // Attr.put("Related", ""); * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", ""); * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld", * ""); Attr.put("partial", ""); Attr.put("s_form", ""); * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos", * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class", * ""); */ Attr.put("gcld", "0"); Attr.put("jac", "0"); Attr.put("cos", "0"); Attr.put("kl", "0"); Attr.put("ppr", "0"); Attr.put("s_form", "0"); Attr.put("contxR", "0"); Attr.put("contxL", "0"); Attr.put("FirstPos", "0"); Attr.put("LastPos", "0"); Attr.put("LengthBody", "0"); Attr.put("FirstPosNorm", "0"); Attr.put("MentionsBody", "0"); Attr.put("RelatedBody", "0"); Attr.put("Spread", "0"); Attr.put("LastPosNorm", "0"); Attr.put("SpreadNorm", "0"); Attr.put("LengthAnchor", 
"0"); Attr.put("Source", "0"); Attr.put("LengthTitle", "0"); Attr.put("partial", "0"); Attr.put("MentionsAnchor", "0"); Attr.put("Relatedtitle", "0"); Attr.put("English", "0"); Attr.put("RelatedAnchor", "0"); Attr.put("MentionsTitle", "0"); Attr.put("Class", "0"); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(MYVER, new Path(myverFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString()); conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(FeatureExtractor_DocExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); /* job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; */ //job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(TextOutputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(StreamItemWritable.class); //job.setMapOutputValueClass(Text.class); //job.setOutputKeyClass(StreamItemWritable.class); // job.setCombinerClass(MyReducer.class); //job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Run_info.Factory().toJSON(fr)); System.out.println("@RELATION" + " trec-kba" + " "); for (String key : Attr.keySet()) { if (key.equalsIgnoreCase("English")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}"); } else if (key.equalsIgnoreCase("Class")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1}"); } else { System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC"); } } System.out.println("\n@DATA"); // Text line = new Text(); // LineReader reader = new LineReader(fs.open(new Path(out // + "/part-r-00000"))); // for (int i = 0; i < num_filter_results; i++) { // reader.readLine(line); // System.out.println(line.toString().split("\t\t")[1]); // } System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; }
From source file:nl.cwi.kba2013.apps.KBaDocExtractorFromCleansed.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;//from w w w. j a v a 2 s. com String out = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } Configuration conf = getConf(); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Annotation Extraction"); job.setJarByClass(KBaDocExtractorFromCleansed.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StreamItemWritable.class); LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); return status; }
From source file:nl.cwi.kba2013.apps.KbaExtractMissingFromRaw.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/*from w w w . j a va2s .c om*/ String out = null; String queryfile = null; String labelsFile = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Missing Extractor"); job.setJarByClass(KbaExtractMissingFromRaw.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); //let's see it crushing again //job.setInputFormatClass(ThriftFileInputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); //job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KbaNameVariantMatch.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;//from w ww . ja va 2s. c o m String out = null; String queryfile = null; String labelsFile = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KbaNameVariantMatch.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); //job.setInputFormatClass(ThriftFileInputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KbaNameVariantMatchFeatureExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/*from www .ja va 2 s . c o m*/ String out = null; String queryfile = null; String contextFile = null; String systemdescription = null; String systemdescription_short = null; String corpus_id = null; String runtag = null; String teamname = null; String annoFile = null; String gcldFile = null; String labelsFile = null; String pprFile = null; String myverFile = null; String wikiFile = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-a".equals(args[i])) { annoFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-ds".equals(args[i])) { systemdescription_short = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-s".equals(args[i])) { myverFile = args[++i]; } else if ("-c".equals(args[i])) { contextFile = args[++i]; } else if ("-w".equals(args[i])) { wikiFile = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if 
(corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == null) systemdescription = "Description intentionally left blank."; if (systemdescription_short == null) systemdescription_short = "a two -step classification approach"; // LOG.info("Tool: " + this.getClass().getName()); // LOG.info(" - input path: " + in); // LOG.info(" - output path: " + out); // LOG.info(" - runtag: " + runtag); // LOG.info(" - teamname: " + teamname); // LOG.info(" - corpus_id: " + corpus_id); // LOG.info(" - run description: " + systemdescription); Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short, corpus_id); Map<String, String> Attr = new LinkedHashMap<String, String>(); // Attr.put("trec-kba", ""); /* * Attr.put("LengthTitle", ""); Attr.put("LengthBody", ""); * Attr.put("LengthAnchor", ""); Attr.put("Source", ""); * Attr.put("English", ""); Attr.put("MentionsTitle", ""); * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", ""); * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread", * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", ""); * Attr.put("SpreadNorm", ""); // Attr.put("Related", ""); * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", ""); * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld", * ""); Attr.put("partial", ""); Attr.put("s_form", ""); * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos", * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class", * ""); */ Attr.put("gcld", "0"); Attr.put("jac", "0"); Attr.put("cos", "0"); Attr.put("kl", "0"); Attr.put("ppr", "0"); Attr.put("s_form", "0"); Attr.put("contxR", "0"); Attr.put("contxL", "0"); Attr.put("FirstPos", "0"); Attr.put("LastPos", "0"); Attr.put("LengthBody", "0"); Attr.put("FirstPosNorm", "0"); Attr.put("MentionsBody", "0"); Attr.put("RelatedBody", "0"); Attr.put("Spread", "0"); Attr.put("LastPosNorm", "0"); Attr.put("SpreadNorm", "0"); 
Attr.put("LengthAnchor", "0"); Attr.put("Source", "0"); Attr.put("LengthTitle", "0"); Attr.put("partial", "0"); Attr.put("MentionsAnchor", "0"); Attr.put("Relatedtitle", "0"); Attr.put("English", "0"); Attr.put("RelatedAnchor", "0"); Attr.put("MentionsTitle", "0"); Attr.put("Class", "0"); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(MYVER, new Path(myverFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString()); conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KbaNameVariantMatchFeatureExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); /* job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; */ //job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(TextOutputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(StreamItemWritable.class); //job.setMapOutputValueClass(Text.class); //job.setOutputKeyClass(StreamItemWritable.class); // job.setCombinerClass(MyReducer.class); //job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Run_info.Factory().toJSON(fr)); System.out.println("@RELATION" + " trec-kba" + " "); for (String key : Attr.keySet()) { if (key.equalsIgnoreCase("English")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}"); } else if (key.equalsIgnoreCase("Class")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1}"); } else { System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC"); } } System.out.println("\n@DATA"); // Text line = new Text(); // LineReader reader = new LineReader(fs.open(new Path(out // + "/part-r-00000"))); // for (int i = 0; i < num_filter_results; i++) { // reader.readLine(line); // System.out.println(line.toString().split("\t\t")[1]); // } System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; }
From source file:nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/*w w w . j ava 2 s. c o m*/ String out = null; String labelsFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || labelsFile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KBANameVariantMatchTHERank.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(50); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KBANER.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// w ww.j a v a 2 s. c om String out = null; String labelsFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || labelsFile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KBANER.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }