List of usage examples for org.apache.hadoop.mapreduce.Job.setJobName
public void setJobName(String name) throws IllegalStateException
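For orientation, here is a minimal, self-contained driver sketch (not taken from the projects below) showing where setJobName fits in a typical job setup. The class name and paths are hypothetical, and no mapper or reducer is configured, so the job simply passes its input through the default identity classes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver class, for illustration only.
public class SetJobNameExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SetJobNameExample.class);
        // setJobName must be called before the job is submitted;
        // calling it on a job that is already running throws IllegalStateException.
        job.setJobName("Set Job Name Example");
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // No mapper/reducer set: the default identity classes are used.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}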
From source file:eu.edisonproject.classification.tfidf.mapreduce.WordCountsForDocsDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(WordCountsForDocsDriver.class);
    job.setJobName("Word Counts For Docs Driver");

    Path inPath = new Path(args[0]);
    Path outPath = new Path(args[1]);
    FileInputFormat.setInputPaths(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);
    outPath.getFileSystem(conf).delete(outPath, true);

    job.setMapperClass(WordCountsForDocsMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(WordCountsForDocsReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.classification.tfidf.mapreduce.WordFrequencyInDocDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // itemset = new LinkedList<String>();
    // BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(args[2])));
    // String line;
    // while ((line = br.readLine()) != null) {
    //     String[] components = line.split("/");
    //     itemset.add(components[0]);
    // }
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(WordFrequencyInDocDriver.class);
    job.setJobName("Word Frequency In Doc Driver");

    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true);

    Path in = new Path(args[0]);
    Path inHdfs = in;
    Path dictionaryLocal = new Path(args[2]);
    Path dictionaryHDFS = dictionaryLocal;
    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        dictionaryHDFS = new Path(dictionaryLocal.getName());
        if (!fs.exists(dictionaryHDFS)) {
            fs.copyFromLocalFile(dictionaryLocal, dictionaryHDFS);
        }
        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    FileStatus dictionaryStatus = fs.getFileStatus(dictionaryHDFS);
    dictionaryHDFS = dictionaryStatus.getPath();
    job.addCacheFile(dictionaryHDFS.toUri());
    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();
    job.addCacheFile(stopwordsHDFS.toUri());

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(WordFrequencyInDocMapper.class);
    AvroJob.setInputKeySchema(job, Document.getClassSchema());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(WordFrequencyInDocReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    Path in = new Path(args[0]);
    Path inHdfs = in;
    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        // Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}", new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);

    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);
    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);

    // FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();
    for (File stat : stats) {
        // for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.TFIDF.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (docs == null) {
        docs = new HashSet<>();
    }
    if (docs.isEmpty()) {
        CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(args[3]), true);
        cleanStopWord = new StopWord(stopWordArraySet);
        File docsDir = new File(args[2]);
        for (File f : docsDir.listFiles()) {
            if (FilenameUtils.getExtension(f.getName()).endsWith("txt")) {
                ReaderFile rf = new ReaderFile(f.getAbsolutePath());
                cleanStopWord.setDescription(rf.readFile());
                docs.add(cleanStopWord.execute());
            }
        }
    }

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TFIDF.class);
    job.setJobName("IDF");

    Path inPath = new Path(args[0]);
    Path outPath = new Path(args[1]);
    FileInputFormat.setInputPaths(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);
    outPath.getFileSystem(conf).delete(outPath, true);

    job.setMapperClass(IDFMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(IDFReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.WordCountsForDocsDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(WordCountsForDocsDriver.class); job.setJobName("Word Counts For Docs Driver"); Path inPath = new Path(args[0]); Path outPath = new Path(args[1]); FileSystem fs = FileSystem.get(conf); fs.delete(outPath, true);/* w w w . ja va 2 s . co m*/ FileInputFormat.setInputPaths(job, inPath); FileOutputFormat.setOutputPath(job, outPath); outPath.getFileSystem(conf).delete(outPath, true); job.setMapperClass(WordCountsForDocsMapper.class); // job.setInputFormatClass(NLineInputFormat.class); // NLineInputFormat.addInputPath(job, inPath); // NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[2])); // NLineInputFormat.setMaxInputSplitSize(job, 2000); /*Here it is possible put the combiner class job.setCombinerClass(AvroAverageCombiner.class); */ // job.setOutputFormatClass(AvroKeyValueOutputFormat.class); // job.setReducerClass(WordCountsForDocsReducer.class); // AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); // AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.STRING)); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setReducerClass(WordCountsForDocsReducer.class); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:eu.edisonproject.training.tfidf.mapreduce.WordFrequencyInDocDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    // if (isHaddopOn()) {
    //     jobconf.set("fs.defaultFS", "hdfs://master.ib.cluster:8020");
    //     jobconf.set("mapred.job.tracker", "localhost:9001");
    // }
    // try {
    new Path(args[1]).getFileSystem(jobconf).delete(new Path(args[1]), true);
    // } catch (java.net.ConnectException ex) {
    //     jobconf.set("fs.defaultFS", "file:///");
    //     jobconf.set("mapred.job.tracker", null);
    //     new Path(args[1]).getFileSystem(jobconf).delete(new Path(args[1]), true);
    // }

    itemset = new LinkedList<String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(args[2])));
    String line;
    while ((line = br.readLine()) != null) {
        String[] components = line.split("/");
        itemset.add(components[0]);
    }

    Job job = Job.getInstance(jobconf);
    job.setJarByClass(WordFrequencyInDocDriver.class);
    job.setJobName("Word Frequency In Doc Driver");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(WordFrequencyInDocMapper.class);
    AvroJob.setInputKeySchema(job, Term.getClassSchema());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(WordFrequencyInDocReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.WordsGroupByTitleDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // Configuration config = HBaseConfiguration.create();
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // TableMapReduceUtil.addDependencyJars(job);
    job.setJarByClass(WordsGroupByTitleDriver.class);
    // This row must be changed
    job.setJobName("Words Group By Title Driver");

    Path inPath = new Path(args[0]);
    Path outPath = new Path(args[1]);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outPath, true);
    FileInputFormat.setInputPaths(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);
    outPath.getFileSystem(conf).delete(outPath, true);

    job.setMapperClass(WordsGroupByTitleMapper.class);
    // job.setInputFormatClass(NLineInputFormat.class);
    // NLineInputFormat.addInputPath(job, inPath);
    // NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[2]));
    // NLineInputFormat.setMaxInputSplitSize(job, 2000);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(WordsGroupByTitleReducer.class);
    // job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    // job.setReducerClass(WordsGroupByTitleReducer.class);
    // AvroJob.setOutputKeySchema(job, TfidfDocument.SCHEMA$);
    // AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.STRING));

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.WordsInCorpusTFIDFDriver.java
License:Apache License
@Override
public int run(String[] rawArgs) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(WordsInCorpusTFIDFDriver.class);
    // This row must be changed
    job.setJobName(rawArgs[2]);

    FileSystem fs = FileSystem.get(conf);
    Path inPath = new Path(rawArgs[0]);
    Path outPath = new Path(rawArgs[1]);
    fs.delete(outPath, true);
    FileInputFormat.setInputPaths(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(WordsInCorpusTFIDFMapper.class);
    // job.setInputFormatClass(NLineInputFormat.class);
    // NLineInputFormat.addInputPath(job, inPath);
    // NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(rawArgs[3]));
    // NLineInputFormat.setMaxInputSplitSize(job, 2000);
    // job.setInputFormatClass(AvroKeyValueInputFormat.class);
    // job.setMapperClass(WordsInCorpusTFIDFMapper.class);
    // AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.STRING));
    // AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));
    //
    // job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    // job.setReducerClass(WordsInCorpusTFIDFReducer.class);
    // AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    // AvroJob.setOutputValueSchema(job, Tfidf.getClassSchema());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(WordsInCorpusTFIDFReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.scape_project.tb.wc.archd.hadoop.HadoopArcReaderJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser gop = new GenericOptionsParser(conf, args);
    HadoopJobCliConfig pc = new HadoopJobCliConfig();
    CommandLineParser cmdParser = new PosixParser();
    CommandLine cmd = cmdParser.parse(HadoopJobOptions.OPTIONS, gop.getRemainingArgs());
    if ((args.length == 0) || (cmd.hasOption(HadoopJobOptions.HELP_OPT))) {
        HadoopJobOptions.exit("Usage", 0);
    } else {
        HadoopJobOptions.initOptions(cmd, pc);
    }
    String dir = pc.getDirStr();
    String name = pc.getHadoopJobName();
    if (name == null || name.equals("")) {
        name = "webarc_reader"; // default job name
    }

    Job job = new Job(conf);

    //**********************************************************
    // for debugging in local mode
    // comment out the 2 lines below before switching to pseudo-distributed or fully-distributed mode
    // job.getConfiguration().set("mapred.job.tracker", "local");
    // job.getConfiguration().set("fs.default.name", "local");
    //**********************************************************

    FileInputFormat.setInputPaths(job, new Path(dir));
    String outpath = "output/" + System.currentTimeMillis() + "wcr";
    logger.info("Output directory: " + outpath);
    FileOutputFormat.setOutputPath(job, new Path(outpath));

    job.setJarByClass(HadoopArcReaderJob.class);
    job.setJobName(name);

    //*** Set interface data types
    // We are using LONG because this value can become very large on huge archives.
    // In order to use the combiner function, the map output also needs to be a LONG.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //*** Set up the mapper, combiner and reducer
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    //*** Set the MAP output compression
    //job.getConfiguration().set("mapred.compress.map.output", "true");

    //*** Set input / output format
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //*** Start the job and wait for it
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:eu.scape_project.tb.wc.archd.mapreduce.FileCharacterisation.java
License:Apache License
public int run(String[] args) throws Exception {
    // The original snippet left the Job null; it must be created before it is
    // configured, otherwise every call below throws a NullPointerException.
    Job job = Job.getInstance(getConf());
    System.out.println(getConf().get("mapreduce.job.user.classpath.first"));
    for (int i = 0; i < args.length; i++) {
        System.out.println("Arg" + i + ": " + args[i]);
    }

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(FileCharacterisation.class);
    job.setJobName(name);

    //*** Set interface data types
    // We are using LONG because this value can become very large on huge archives.
    // In order to use the combiner function, the map output also needs to be a LONG.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //*** Set up the mapper, combiner and reducer
    job.setMapperClass(TikaMap.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    //*** Set the MAP output compression
    //job.getConfiguration().set("mapred.compress.map.output", "true");

    //*** Set input / output format
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //*** Start the job and wait for it
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}