List of usage examples for org.apache.hadoop.mapreduce Job addCacheFile
public void addCacheFile(URI uri)
From source file:edu.gslis.ts.hadoop.ThriftSentenceScorerHbase.java
License:Apache License
public int run(String[] args) throws Exception { String tableName = args[0];//from ww w.j a va 2 s. co m Path topicsFile = new Path(args[1]); Path vocabFile = new Path(args[2]); Path outputPath = new Path(args[3]); // String queryId = args[1]; Configuration config = HBaseConfiguration.create(getConf()); Job job = Job.getInstance(config); job.setJarByClass(ThriftSentenceScorerHbase.class); Scan scan = new Scan(); scan.setCaching(500); scan.setCacheBlocks(false); /* Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId)); scan.setFilter(prefixFilter); */ TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class, Text.class, // mapper output key Text.class, // mapper output value job); job.addCacheFile(topicsFile.toUri()); job.addCacheFile(vocabFile.toUri()); FileOutputFormat.setOutputPath(job, outputPath); boolean b = job.waitForCompletion(true); if (!b) { throw new IOException("error with job!"); } return 0; }
From source file:edu.umd.gorden2.PairsPMI.java
License:Apache License
/** * Runs this tool.//w w w.j av a2 s . c o m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PairsPMI.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); // // Pair job Job job = Job.getInstance(getConf()); job.setJobName(PairsPMI.class.getSimpleName()); job.setJarByClass(PairsPMI.class); // Delete the output directory if it exists already. 
Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(PairOfStrings.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(PairOfStrings.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); //job.setPartitionerClass(MyPartitioner.class); job.addCacheFile(new URI("wc/part-r-00000")); long startTime = System.currentTimeMillis(); // // wordcount job Job job2 = Job.getInstance(getConf()); job2.setJobName("Wordcount"); job2.setJarByClass(PairsPMI.class); String outputPath2 = "wc"; // Delete the output directory if it exists already. Path outputDir2 = new Path(outputPath2); FileSystem.get(getConf()).delete(outputDir2, true); job2.setNumReduceTasks(1); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath2)); job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(IntWritable.class); job2.setMapperClass(MyMapper2.class); job2.setCombinerClass(MyReducer2.class); job2.setReducerClass(MyReducer2.class); // add side file to job1 job.addCacheFile(new URI("wc/part-r-00000")); job2.waitForCompletion(true); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.gorden2.StripesPMI.java
License:Apache License
/** * Runs this tool.//from w w w . ja va 2 s . c o m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + StripesPMI.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(StripesPMI.class.getSimpleName()); job.setJarByClass(StripesPMI.class); // Delete the output directory if it exists already. 
Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(HMapStFW.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.addCacheFile(new URI("wc/part-r-00000")); // // wordcount job Job job2 = Job.getInstance(getConf()); job2.setJobName("Wordcount"); job2.setJarByClass(PairsPMI.class); String outputPath2 = "wc"; // Delete the output directory if it exists already. Path outputDir2 = new Path(outputPath2); FileSystem.get(getConf()).delete(outputDir2, true); job2.setNumReduceTasks(1); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath2)); job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(IntWritable.class); job2.setMapperClass(MyMapper2.class); job2.setCombinerClass(MyReducer2.class); job2.setReducerClass(MyReducer2.class); long startTime = System.currentTimeMillis(); job2.waitForCompletion(true); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.honghongie.PairsPMI.java
License:Apache License
/** * Runs this tool./* www . ja v a2 s. co m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); // options.addOption(OptionBuilder.withArgName("num").hasArg() // .withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; // int window = cmdline.hasOption(WINDOW) ? 
// Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2; LOG.info("Tool: " + PairsPMI.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); // LOG.info(" - window: " + window); LOG.info(" - number of reducers: " + reduceTasks); //JobConf conf = new JobConf(PairsPMI.class); // first job //Job job1 = new Job (conf,"join1"); Configuration conf1 = getConf(); Job job1 = Job.getInstance(conf1); job1.setJobName(PairsPMI.class.getSimpleName()); job1.setJarByClass(PairsPMI.class); job1.setNumReduceTasks(1); //ensure go to one file //file path of job1 // Delete the output directory if it exist Path dir = new Path("temp"); FileSystem.get(getConf()).delete(dir, true); FileInputFormat.setInputPaths(job1, new Path(inputPath)); FileOutputFormat.setOutputPath(job1, new Path("temp")); job1.setMapperClass(Map_First.class); job1.setCombinerClass(MyCombiner.class); job1.setReducerClass(Reduce_First.class); job1.setMapOutputKeyClass(Text.class);//map output key job1.setMapOutputValueClass(IntWritable.class);//map output value job1.setOutputKeyClass(Text.class);//reduce output key job1.setOutputValueClass(IntWritable.class);//reduce output value // ControlledJob ctrljob1=new ControlledJob(conf); // ctrljob1.setJob(job1); long startTime1 = System.currentTimeMillis(); job1.waitForCompletion(true); System.out.println( "First Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds"); //begin job2 //Configuration conf2 = getConf(); Job job2 = Job.getInstance(getConf()); job2.setJobName(PairsPMI.class.getSimpleName()); job2.setJarByClass(PairsPMI.class); job2.setNumReduceTasks(reduceTasks); //delete the output directory if it exists. 
Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); //file path of job2 FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.addCacheFile(new URI("temp/part-r-00000")); job2.setMapperClass(Map_Second.class); job2.setCombinerClass(MyCombiner_Second.class); job2.setReducerClass(Reduce_Second.class); job2.setMapOutputKeyClass(PairOfStrings.class);//map output key job2.setMapOutputValueClass(FloatWritable.class);//map output value job2.setOutputKeyClass(PairOfStrings.class);//reduce output key job2.setOutputValueClass(FloatWritable.class);//reduce output value long startTime2 = System.currentTimeMillis(); job2.waitForCompletion(true); System.out.println( "Second Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds"); System.out.println( "Total Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds"); System.out.println("Total number of lines:" + lines); return 0; }
From source file:edu.umd.honghongie.StripesPMI.java
License:Apache License
/** * Runs this tool.//from w ww .j a va 2 s . co m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); // options.addOption(OptionBuilder.withArgName("num").hasArg() // .withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; // int window = cmdline.hasOption(WINDOW) ? 
// Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2; LOG.info("Tool: " + StripesPMI.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); // LOG.info(" - window: " + window); LOG.info(" - number of reducers: " + reduceTasks); //JobConf conf = new JobConf(PairsPMI.class); // first job //Job job1 = new Job (conf,"join1"); Configuration conf1 = getConf(); Job job1 = Job.getInstance(conf1); job1.setJobName(StripesPMI.class.getSimpleName()); job1.setJarByClass(StripesPMI.class); job1.setNumReduceTasks(1); //file path of job1 // Delete the output directory if it exist Path dir = new Path("temp"); FileSystem.get(getConf()).delete(dir, true); FileInputFormat.setInputPaths(job1, new Path(inputPath)); FileOutputFormat.setOutputPath(job1, new Path("temp")); job1.setMapperClass(Map_First.class); job1.setCombinerClass(MyCombiner.class); job1.setReducerClass(Reduce_First.class); job1.setMapOutputKeyClass(Text.class);//map output key job1.setMapOutputValueClass(IntWritable.class);//map output value job1.setOutputKeyClass(Text.class);//reduce output key job1.setOutputValueClass(IntWritable.class);//reduce output value // ControlledJob ctrljob1=new ControlledJob(conf); // ctrljob1.setJob(job1); long startTime1 = System.currentTimeMillis(); job1.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds"); //begin job2 //Configuration conf2 = getConf(); Job job2 = Job.getInstance(getConf()); job2.setJobName(StripesPMI.class.getSimpleName()); job2.setJarByClass(StripesPMI.class); job2.setNumReduceTasks(reduceTasks); //delete the output directory if it exists. 
Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); //file path of job2 FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.addCacheFile(new URI("temp/part-r-00000")); job2.setMapperClass(Map_Second.class); job2.setReducerClass(Reduce_Second.class); job2.setMapOutputKeyClass(Text.class);//map output key job2.setMapOutputValueClass(HMapStIW.class);//map output value job2.setOutputKeyClass(PairOfStrings.class);//reduce output key job2.setOutputValueClass(FloatWritable.class);//reduce output value long startTime2 = System.currentTimeMillis(); job2.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds"); System.out .println("Total Job Finished in" + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds"); System.out.println("total number of lines:" + lines); return 0; }
From source file:edu.umd.windmemory.PMIPairs.java
License:Apache License
/** * Runs this tool.// w w w. j a v a 2 s. c o m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairs.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairs.class.getSimpleName()); job.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairs.class.getSimpleName()); job2.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", "temp"); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(PairOfStrings.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.windmemory.PMIPairsR.java
License:Apache License
/** * Runs this tool.//w ww. j a v a2s. c om */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairsR.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairsR.class.getSimpleName()); job.setJarByClass(PMIPairsR.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); // job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairsR.class.getSimpleName()); job2.setJarByClass(PMIPairsR.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", "temp"); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(PairOfStrings.class); job2.setMapOutputValueClass(IntWritable.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); // job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.windmemory.PMIStripes.java
License:Apache License
/** * Runs this tool./*w ww.j av a 2 s . c o m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + PMIPairs.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PMIPairs.class.getSimpleName()); job.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. 
Path interDir = new Path("temp"); FileSystem.get(getConf()).delete(interDir, true); // job.setNumMapTasks(reduceTasks); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, interDir); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyFirstMapper.class); job.setCombinerClass(MyFirstReducer.class); job.setReducerClass(MyFirstReducer.class); job.setPartitionerClass(MyFirstPartitioner.class); Job job2 = Job.getInstance(getConf()); job2.setJobName(PMIPairs.class.getSimpleName()); job2.setJarByClass(PMIPairs.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); // job2.getConfiguration().set("path", interDir.toString()); // job2.getConfiguration().setInt("num", reduceTasks); job2.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job2, new Path(inputPath)); FileOutputFormat.setOutputPath(job2, new Path(outputPath)); job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(HMapStIW.class); job2.setOutputKeyClass(PairOfStrings.class); job2.setOutputValueClass(DoubleWritable.class); job2.setMapperClass(MySecondMapper.class); job2.setCombinerClass(MySecondCombiner.class); job2.setReducerClass(MySecondReducer.class); job2.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job2.addCacheFile(new URI("temp/part-r-00000")); job.waitForCompletion(true); job2.waitForCompletion(true); // FileSystem.get(getConf()).delete(interDir, true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:eu.edisonproject.classification.tfidf.mapreduce.CompetencesDistanceDriver.java
License:Apache License
@Override public int run(String[] args) { try {/*from w w w .java 2 s.c o m*/ Configuration conf = HBaseConfiguration.create(); //additional output using TextOutputFormat. conf.set("file.names", args[3]); Job job = Job.getInstance(conf); //TableMapReduceUtil.addDependencyJars(job); job.setJarByClass(CompetencesDistanceDriver.class); //This row must be changed job.setJobName("Words Group By Title Driver"); Path inPath = new Path(args[0]); Path outPath = new Path(args[1]); Path competencesPath = new Path(args[2]); Path competencesPathHDFS = competencesPath; FileSystem fs = FileSystem.get(conf); if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) { competencesPathHDFS = new Path(competencesPath.getName()); if (!fs.exists(competencesPathHDFS)) { fs.mkdirs(competencesPathHDFS); File[] stats = new File(competencesPath.toString()).listFiles(); for (File stat : stats) { Path filePath = new Path(stat.getAbsolutePath()); if (FilenameUtils.getExtension(filePath.getName()).endsWith("csv")) { Path dest = new Path(competencesPathHDFS.toUri() + "/" + filePath.getName()); fs.copyFromLocalFile(filePath, dest); } } } } job.addCacheFile(competencesPathHDFS.toUri()); FileInputFormat.setInputPaths(job, inPath); FileOutputFormat.setOutputPath(job, outPath); fs.delete(outPath, true); job.setMapperClass(CompetencesDistanceMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setReducerClass(CompetencesDistanceReducer.class); // job.setOutputFormatClass(TableOutputFormat.class); // job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "jobpostcompetence"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); String[] fileNames = args[3].split(","); for (String n : fileNames) { MultipleOutputs.addNamedOutput(job, n, TextOutputFormat.class, Text.class, Text.class); } return (job.waitForCompletion(true) ? 
0 : 1); } catch (IOException | IllegalStateException | IllegalArgumentException | InterruptedException | ClassNotFoundException ex) { Logger.getLogger(CompetencesDistanceDriver.class.getName()).log(Level.SEVERE, null, ex); } return 0; }
From source file:eu.edisonproject.classification.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
/**
 * Configures and runs the term word-frequency job.
 *
 * <p>Positional arguments: {@code args[0]} is the dictionary (the job input),
 * {@code args[1]} the output path (deleted first), {@code args[2]} the documents directory,
 * {@code args[3]} the stop-words file and {@code args[4]} the number of dictionary lines per
 * input split.
 *
 * <p>When the default file system is not a {@code file} one, the dictionary, the {@code txt}
 * documents and the stop-words file are copied from the local file system to paths of the same
 * name on the job's file system. The stop-words file and the documents directory are
 * registered as cache files.
 *
 * @param args the five positional arguments described above
 * @return 0 when the job succeeds, 1 when it fails
 * @throws IllegalArgumentException if the local documents directory cannot be listed
 * @throws Exception if job setup, submission or monitoring fails
 */
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    Job job = Job.getInstance(jobconf);
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    Path dictionary = new Path(args[0]);
    Path dictionaryHdfs = dictionary;

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = localDocs;

    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        dictionaryHdfs = new Path(dictionary.getName());
        if (!fs.exists(dictionaryHdfs)) {
            fs.copyFromLocalFile(dictionary, dictionaryHdfs);
        }

        // listFiles() returns null when the path is not a readable directory; fail with a clear
        // message before touching the remote file system.
        File[] stats = new File(localDocs.toString()).listFiles();
        if (stats == null) {
            throw new IllegalArgumentException("Cannot list local documents directory: " + localDocs);
        }
        hdfsDocs = new Path(localDocs.getName());
        fs.mkdirs(hdfsDocs);
        fs.deleteOnExit(hdfsDocs);
        for (File stat : stats) {
            Path filePath = new Path(stat.getAbsolutePath());
            if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
                Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
                fs.copyFromLocalFile(filePath, dest);
            }
        }

        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    // Use the path as reported by the file system for the stop-words cache entry.
    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();

    job.addCacheFile(stopwordsHDFS.toUri());
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    // The dictionary is registered as input exactly once; adding the same path a second time
    // would list it twice among the job's input paths.
    FileInputFormat.setInputPaths(job, dictionaryHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.parseInt(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    // NOTE(review): the map output value is IntWritable but the job output value is declared as
    // java.lang.Integer; confirm against TermWordFrequencyReducer's output type.
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return job.waitForCompletion(true) ? 0 : 1;
}