List of usage examples for org.apache.hadoop.mapreduce.Job#setJobName
public void setJobName(String name) throws IllegalStateException
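setJobName assigns a human-readable name shown in the web UI and logs; it has no effect on execution, and it may only be called while the job is still being defined. Once the job has been submitted, it throws IllegalStateException. Before the real-world examples below, here is a minimal, self-contained driver sketch illustrating this (the class name SetJobNameExample and the argument handling are illustrative assumptions, not taken from any of the projects that follow):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical minimal driver: no mapper/reducer is set, so the identity
// Mapper and Reducer defaults apply and input records pass straight through.
public class SetJobNameExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("set-job-name-example"); // legal: the job is still in the DEFINE state
        job.setJarByClass(SetJobNameExample.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.submit();
        // The job has now left the DEFINE state; calling
        // job.setJobName("renamed") here would throw IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}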
From source file: edu.umd.windmemory.PMIPairsR.java
License: Apache License
/** Runs this tool. */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairsR.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairsR.class.getSimpleName());
    job.setJarByClass(PMIPairsR.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    // job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairsR.class.getSimpleName());
    job2.setJarByClass(PMIPairsR.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    // job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file: edu.umd.windmemory.PMIStripes.java
License: Apache License
/** Runs this tool. */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", interDir.toString());
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(HMapStIW.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file: edu.umd.windmemory.RunPersonalizedPageRankBasic.java
License: Apache License
private float[] phase1(int i, int j, String basePath, int numNodes, String sources) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - source nodes: " + sources);
    // LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);
    job.getConfiguration().set("source.nodes", sources);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int num = sources.split(",").length;
    float[] mass = new float[num];
    for (int m = 0; m < num; m++) {
        mass[m] = Float.NEGATIVE_INFINITY;
    }
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        for (int m = 0; m < num; m++) {
            mass[m] = sumLogProbs(mass[m], fin.readFloat());
        }
        fin.close();
    }

    return mass;
}
From source file: edu.umd.windmemory.RunPersonalizedPageRankBasic.java
License: Apache License
private void phase2(int i, int j, float[] missings, String basePath, int numNodes, String sources)
        throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String missing = Arrays.toString(missings);
    missing = missing.substring(1, missing.length() - 1);
    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + numNodes);

    String in = basePath + "/iter" + formatter.format(j) + "t";
    String out = basePath + "/iter" + formatter.format(j);

    LOG.info("PageRank: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("MissingMass", missing);
    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().set("source.nodes", sources);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}
From source file: edu.umn.cs.spatialHadoop.nasa.HDFToText.java
License: Open Source License
/**
 * Performs an HDF to text operation as a MapReduce job and returns the total
 * number of points generated.
 * @param inPath the input path
 * @param outPath the output path
 * @param datasetName the name of the dataset to extract
 * @param skipFillValue whether to skip fill values
 * @param params additional operation parameters
 * @return the total number of points generated
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static long HDFToTextMapReduce(Path inPath, Path outPath, String datasetName, boolean skipFillValue,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(params, "HDFToText");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(HDFToText.class);
    job.setJobName("HDFToText");

    // Set Map function details
    job.setMapperClass(HDFToTextMap.class);
    job.setNumReduceTasks(0);

    // Set input information
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inPath);
    if (conf.get("shape") == null)
        conf.setClass("shape", NASAPoint.class, Shape.class);
    conf.set("dataset", datasetName);
    conf.setBoolean("skipfillvalue", skipFillValue);

    // Set output information
    job.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job, outPath);

    // Run the job
    boolean verbose = conf.getBoolean("verbose", false);
    job.waitForCompletion(verbose);
    Counters counters = job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java
License: Open Source License
/**
 * Generates a single level using a MapReduce job and returns the created job.
 * @param inFiles the input files to plot
 * @param outFile the output image file
 * @param plotterClass the plotter class to use
 * @param params additional operation parameters
 * @return the created job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static Job plotMapReduce(Path[] inFiles, Path outFile, Class<? extends Plotter> plotterClass,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    }

    Job job = new Job(params, "SingleLevelPlot");
    job.setJarByClass(SingleLevelPlot.class);
    job.setJobName("SingleLevelPlot");

    // Set plotter
    Configuration conf = job.getConfiguration();
    Plotter.setPlotter(conf, plotterClass);

    // Set input file MBR
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    Rectangle drawRect = (Rectangle) params.getShape("rect");
    if (inputMBR == null)
        inputMBR = drawRect != null ? drawRect : FileMBR.fileMBR(inFiles, params);
    OperationsParams.setShape(conf, InputMBR, inputMBR);
    if (drawRect != null)
        OperationsParams.setShape(conf, SpatialInputFormat3.InputQueryRange, drawRect);

    // Adjust width and height if aspect ratio is to be kept
    int imageWidth = conf.getInt("width", 1000);
    int imageHeight = conf.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) imageWidth / imageHeight) {
            // Fix width and change height
            imageHeight = (int) (inputMBR.getHeight() * imageWidth / inputMBR.getWidth());
            // Make divisible by two for compatibility with ffmpeg
            if (imageHeight % 2 == 1)
                imageHeight--;
            conf.setInt("height", imageHeight);
        } else {
            imageWidth = (int) (inputMBR.getWidth() * imageHeight / inputMBR.getHeight());
            conf.setInt("width", imageWidth);
        }
    }

    boolean merge = conf.getBoolean("merge", true);
    // Set input and output
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    if (conf.getBoolean("output", true)) {
        if (merge) {
            job.setOutputFormatClass(CanvasOutputFormat.class);
            conf.setClass("mapred.output.committer.class", CanvasOutputFormat.ImageWriterOld.class,
                    org.apache.hadoop.mapred.OutputCommitter.class);
        } else {
            job.setOutputFormatClass(ImageOutputFormat.class);
        }
        CanvasOutputFormat.setOutputPath(job, outFile);
    } else {
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    // Set mapper and reducer based on the partitioning scheme
    String partition = conf.get("partition", "none");
    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    if (partition.equalsIgnoreCase("none")) {
        LOG.info("Using no-partition plot");
        job.setMapperClass(NoPartitionPlotMap.class);
        job.setCombinerClass(NoPartitionPlotCombine.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(plotter.getCanvasClass());
        if (merge) {
            int numSplits = new SpatialInputFormat3().getSplits(job).size();
            job.setReducerClass(NoPartitionPlotReduce.class);
            // Set number of reduce tasks according to cluster status
            int maxReduce = Math.max(1, clusterStatus.getMaxReduceTasks() * 7 / 8);
            job.setNumReduceTasks(Math.max(1, Math.min(maxReduce, numSplits / maxReduce)));
        } else {
            job.setNumReduceTasks(0);
        }
    } else {
        LOG.info("Using repartition plot");
        Partitioner partitioner;
        if (partition.equals("pixel")) {
            // Special case for pixel level partitioning as it depends on the
            // visualization parameters
            partitioner = new GridPartitioner(inputMBR, imageWidth, imageHeight);
        } else if (partition.equals("grid")) {
            int numBlocks = 0;
            for (Path in : inFiles) {
                FileSystem fs = in.getFileSystem(params);
                long size = FileUtil.getPathSize(fs, in);
                long blockSize = fs.getDefaultBlockSize(in);
                numBlocks += Math.ceil(size / (double) blockSize);
            }
            int numPartitions = numBlocks * 1000;
            int gridSize = (int) Math.ceil(Math.sqrt(numPartitions));
            partitioner = new GridPartitioner(inputMBR, gridSize, gridSize);
        } else {
            // Use a standard partitioner as created by the indexer
            partitioner = Indexer.createPartitioner(inFiles, outFile, conf, partition);
        }
        Shape shape = params.getShape("shape");
        job.setMapperClass(RepartitionPlotMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(RepartitionPlotReduce.class);
        // Set number of reducers according to cluster size
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
        Partitioner.setPartitioner(conf, partitioner);
    }

    // Use multithreading in case the job is running locally
    conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        job.submit();
    } else {
        job.waitForCompletion(params.getBoolean("verbose", false));
    }
    return job;
}
From source file: edu.usc.pgroup.louvain.hadoop.LouvainMR.java
License: Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    int displayLevel = Integer.parseInt(args[2]);
    boolean v = false;
    if (args.length > 3) {
        v = Boolean.parseBoolean(args[3]);
    }
    conf.setInt(DISPLAY_LEVEL, displayLevel);
    conf.setBoolean(VERBOSE, v);
    conf.set(OUT_PATH, args[1]);

    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);

    job.setMapperClass(MapCommunity.class);
    job.setReducerClass(ReduceCommunity.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file: edu.usc.pgroup.louvain.hadoop.TestJob.java
License: Apache License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);

    job.setMapperClass(MapJob.class);
    job.setReducerClass(ReduceJob.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file: eu.edisonproject.classification.tfidf.mapreduce.CompetencesDistanceDriver.java
License: Apache License
@Override
public int run(String[] args) {
    try {
        Configuration conf = HBaseConfiguration.create();
        // additional output using TextOutputFormat.
        conf.set("file.names", args[3]);

        Job job = Job.getInstance(conf);
        // TableMapReduceUtil.addDependencyJars(job);
        job.setJarByClass(CompetencesDistanceDriver.class);
        // This row must be changed
        job.setJobName("Words Group By Title Driver");

        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);
        Path competencesPath = new Path(args[2]);
        Path competencesPathHDFS = competencesPath;

        FileSystem fs = FileSystem.get(conf);
        if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
            competencesPathHDFS = new Path(competencesPath.getName());
            if (!fs.exists(competencesPathHDFS)) {
                fs.mkdirs(competencesPathHDFS);
                File[] stats = new File(competencesPath.toString()).listFiles();
                for (File stat : stats) {
                    Path filePath = new Path(stat.getAbsolutePath());
                    if (FilenameUtils.getExtension(filePath.getName()).endsWith("csv")) {
                        Path dest = new Path(competencesPathHDFS.toUri() + "/" + filePath.getName());
                        fs.copyFromLocalFile(filePath, dest);
                    }
                }
            }
        }
        job.addCacheFile(competencesPathHDFS.toUri());

        FileInputFormat.setInputPaths(job, inPath);
        FileOutputFormat.setOutputPath(job, outPath);
        fs.delete(outPath, true);

        job.setMapperClass(CompetencesDistanceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(CompetencesDistanceReducer.class);
        // job.setOutputFormatClass(TableOutputFormat.class);
        // job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "jobpostcompetence");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        String[] fileNames = args[3].split(",");
        for (String n : fileNames) {
            MultipleOutputs.addNamedOutput(job, n, TextOutputFormat.class, Text.class, Text.class);
        }
        return (job.waitForCompletion(true) ? 0 : 1);
    } catch (IOException | IllegalStateException | IllegalArgumentException | InterruptedException
            | ClassNotFoundException ex) {
        Logger.getLogger(CompetencesDistanceDriver.class.getName()).log(Level.SEVERE, null, ex);
    }
    return 1; // an exception was caught, so report failure rather than success
}
From source file: eu.edisonproject.classification.tfidf.mapreduce.TermWordFrequency.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    Job job = Job.getInstance(jobconf);
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    Path dictionary = new Path(args[0]);
    Path dictionaryHdfs = dictionary;
    Path localDocs = new Path(args[2]);
    Path hdfsDocs = localDocs;
    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        dictionaryHdfs = new Path(dictionary.getName());
        if (!fs.exists(dictionaryHdfs)) {
            fs.copyFromLocalFile(dictionary, dictionaryHdfs);
        }
        hdfsDocs = new Path(localDocs.getName());
        fs.mkdirs(hdfsDocs);
        fs.deleteOnExit(hdfsDocs);
        File[] stats = new File(localDocs.toString()).listFiles();
        for (File stat : stats) {
            Path filePath = new Path(stat.getAbsolutePath());
            if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
                Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
                fs.copyFromLocalFile(filePath, dest);
            }
        }
        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();
    job.addCacheFile(stopwordsHDFS.toUri());
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, dictionaryHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dictionaryHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}