List of usage examples for org.apache.hadoop.mapreduce.Job#setJobName
public void setJobName(String name) throws IllegalStateException
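setJobName assigns a human-readable name shown in the web UI and logs; it has no effect on execution, and it may only be called while the job is still being defined. Once the job has been submitted, it throws IllegalStateException. Before the real-world examples below, here is a minimal, self-contained driver sketch illustrating this (the class name SetJobNameExample and the argument handling are illustrative assumptions, not taken from any of the projects that follow):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical minimal driver: no mapper/reducer is set, so the identity
// Mapper and Reducer defaults apply and input records pass straight through.
public class SetJobNameExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("set-job-name-example"); // legal: the job is still in the DEFINE state
        job.setJarByClass(SetJobNameExample.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.submit();
        // The job has now left the DEFINE state; calling
        // job.setJobName("renamed") here would throw IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}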
From source file: edu.umd.windmemory.PMIPairsR.java
License: Apache License
/** Runs this tool. */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairsR.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairsR.class.getSimpleName());
    job.setJarByClass(PMIPairsR.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    // job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairsR.class.getSimpleName());
    job2.setJarByClass(PMIPairsR.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    // job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file: edu.umd.windmemory.PMIStripes.java
License: Apache License
/** Runs this tool. */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", interDir.toString());
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(HMapStIW.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file: edu.umd.windmemory.RunPersonalizedPageRankBasic.java
License: Apache License
private float[] phase1(int i, int j, String basePath, int numNodes, String sources) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - source nodes: " + sources);
    // LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);
    job.getConfiguration().set("source.nodes", sources);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int num = sources.split(",").length;
    float[] mass = new float[num];
    for (int m = 0; m < num; m++) {
        mass[m] = Float.NEGATIVE_INFINITY;
    }
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        for (int m = 0; m < num; m++) {
            mass[m] = sumLogProbs(mass[m], fin.readFloat());
        }
        fin.close();
    }

    return mass;
}
From source file: edu.umd.windmemory.RunPersonalizedPageRankBasic.java
License: Apache License
private void phase2(int i, int j, float[] missings, String basePath, int numNodes, String sources)
        throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String missing = Arrays.toString(missings);
    missing = missing.substring(1, missing.length() - 1);
    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + numNodes);

    String in = basePath + "/iter" + formatter.format(j) + "t";
    String out = basePath + "/iter" + formatter.format(j);

    LOG.info("PageRank: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("MissingMass", missing);
    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().set("source.nodes", sources);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}
From source file: edu.umn.cs.spatialHadoop.nasa.HDFToText.java
License: Open Source License
/**
 * Performs an HDF to text operation as a MapReduce job and returns the total
 * number of points generated.
 * @param inPath the input path
 * @param outPath the output path
 * @param datasetName the name of the dataset to extract
 * @param skipFillValue whether to skip fill values
 * @param params additional operation parameters
 * @return the total number of points generated
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static long HDFToTextMapReduce(Path inPath, Path outPath, String datasetName, boolean skipFillValue,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(params, "HDFToText");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(HDFToText.class);
    job.setJobName("HDFToText");

    // Set Map function details
    job.setMapperClass(HDFToTextMap.class);
    job.setNumReduceTasks(0);

    // Set input information
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inPath);
    if (conf.get("shape") == null)
        conf.setClass("shape", NASAPoint.class, Shape.class);
    conf.set("dataset", datasetName);
    conf.setBoolean("skipfillvalue", skipFillValue);

    // Set output information
    job.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job, outPath);

    // Run the job
    boolean verbose = conf.getBoolean("verbose", false);
    job.waitForCompletion(verbose);
    Counters counters = job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java
License: Open Source License
/**
 * Generates a single level using a MapReduce job and returns the created job.
 * @param inFiles the input files to plot
 * @param outFile the output image file
 * @param plotterClass the plotter class to use
 * @param params additional operation parameters
 * @return the created job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static Job plotMapReduce(Path[] inFiles, Path outFile, Class<? extends Plotter> plotterClass,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    }

    Job job = new Job(params, "SingleLevelPlot");
    job.setJarByClass(SingleLevelPlot.class);
    job.setJobName("SingleLevelPlot");

    // Set plotter
    Configuration conf = job.getConfiguration();
    Plotter.setPlotter(conf, plotterClass);

    // Set input file MBR
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    Rectangle drawRect = (Rectangle) params.getShape("rect");
    if (inputMBR == null)
        inputMBR = drawRect != null ? drawRect : FileMBR.fileMBR(inFiles, params);
    OperationsParams.setShape(conf, InputMBR, inputMBR);
    if (drawRect != null)
        OperationsParams.setShape(conf, SpatialInputFormat3.InputQueryRange, drawRect);

    // Adjust width and height if aspect ratio is to be kept
    int imageWidth = conf.getInt("width", 1000);
    int imageHeight = conf.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) imageWidth / imageHeight) {
            // Fix width and change height
            imageHeight = (int) (inputMBR.getHeight() * imageWidth / inputMBR.getWidth());
            // Make divisible by two for compatibility with ffmpeg
            if (imageHeight % 2 == 1)
                imageHeight--;
            conf.setInt("height", imageHeight);
        } else {
            imageWidth = (int) (inputMBR.getWidth() * imageHeight / inputMBR.getHeight());
            conf.setInt("width", imageWidth);
        }
    }

    boolean merge = conf.getBoolean("merge", true);
    // Set input and output
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    if (conf.getBoolean("output", true)) {
        if (merge) {
            job.setOutputFormatClass(CanvasOutputFormat.class);
            conf.setClass("mapred.output.committer.class", CanvasOutputFormat.ImageWriterOld.class,
                    org.apache.hadoop.mapred.OutputCommitter.class);
        } else {
            job.setOutputFormatClass(ImageOutputFormat.class);
        }
        CanvasOutputFormat.setOutputPath(job, outFile);
    } else {
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    // Set mapper and reducer based on the partitioning scheme
    String partition = conf.get("partition", "none");
    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    if (partition.equalsIgnoreCase("none")) {
        LOG.info("Using no-partition plot");
        job.setMapperClass(NoPartitionPlotMap.class);
        job.setCombinerClass(NoPartitionPlotCombine.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(plotter.getCanvasClass());
        if (merge) {
            int numSplits = new SpatialInputFormat3().getSplits(job).size();
            job.setReducerClass(NoPartitionPlotReduce.class);
            // Set number of reduce tasks according to cluster status
            int maxReduce = Math.max(1, clusterStatus.getMaxReduceTasks() * 7 / 8);
            job.setNumReduceTasks(Math.max(1, Math.min(maxReduce, numSplits / maxReduce)));
        } else {
            job.setNumReduceTasks(0);
        }
    } else {
        LOG.info("Using repartition plot");
        Partitioner partitioner;
        if (partition.equals("pixel")) {
            // Special case for pixel level partitioning as it depends on the
            // visualization parameters
            partitioner = new GridPartitioner(inputMBR, imageWidth, imageHeight);
        } else if (partition.equals("grid")) {
            int numBlocks = 0;
            for (Path in : inFiles) {
                FileSystem fs = in.getFileSystem(params);
                long size = FileUtil.getPathSize(fs, in);
                long blockSize = fs.getDefaultBlockSize(in);
                numBlocks += Math.ceil(size / (double) blockSize);
            }
            int numPartitions = numBlocks * 1000;
            int gridSize = (int) Math.ceil(Math.sqrt(numPartitions));
            partitioner = new GridPartitioner(inputMBR, gridSize, gridSize);
        } else {
            // Use a standard partitioner as created by the indexer
            partitioner = Indexer.createPartitioner(inFiles, outFile, conf, partition);
        }
        Shape shape = params.getShape("shape");
        job.setMapperClass(RepartitionPlotMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(RepartitionPlotReduce.class);
        // Set number of reducers according to cluster size
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
        Partitioner.setPartitioner(conf, partitioner);
    }

    // Use multithreading in case the job is running locally
    conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        job.submit();
    } else {
        job.waitForCompletion(params.getBoolean("verbose", false));
    }
    return job;
}
From source file: edu.usc.pgroup.louvain.hadoop.LouvainMR.java
License: Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    int displayLevel = Integer.parseInt(args[2]);
    boolean v = false;
    if (args.length > 3) {
        v = Boolean.parseBoolean(args[3]);
    }
    conf.setInt(DISPLAY_LEVEL, displayLevel);
    conf.setBoolean(VERBOSE, v);
    conf.set(OUT_PATH, args[1]);

    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);

    job.setMapperClass(MapCommunity.class);
    job.setReducerClass(ReduceCommunity.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file: edu.usc.pgroup.louvain.hadoop.TestJob.java
License: Apache License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);

    job.setMapperClass(MapJob.class);
    job.setReducerClass(ReduceJob.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file: eu.edisonproject.classification.tfidf.mapreduce.CompetencesDistanceDriver.java
License: Apache License
@Override
public int run(String[] args) {
    try {
        Configuration conf = HBaseConfiguration.create();
        // additional output using TextOutputFormat.
        conf.set("file.names", args[3]);

        Job job = Job.getInstance(conf);
        // TableMapReduceUtil.addDependencyJars(job);
        job.setJarByClass(CompetencesDistanceDriver.class);
        // This row must be changed
        job.setJobName("Words Group By Title Driver");

        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);
        Path competencesPath = new Path(args[2]);
        Path competencesPathHDFS = competencesPath;

        FileSystem fs = FileSystem.get(conf);
        if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
            competencesPathHDFS = new Path(competencesPath.getName());
            if (!fs.exists(competencesPathHDFS)) {
                fs.mkdirs(competencesPathHDFS);
                File[] stats = new File(competencesPath.toString()).listFiles();
                for (File stat : stats) {
                    Path filePath = new Path(stat.getAbsolutePath());
                    if (FilenameUtils.getExtension(filePath.getName()).endsWith("csv")) {
                        Path dest = new Path(competencesPathHDFS.toUri() + "/" + filePath.getName());
                        fs.copyFromLocalFile(filePath, dest);
                    }
                }
            }
        }
        job.addCacheFile(competencesPathHDFS.toUri());

        FileInputFormat.setInputPaths(job, inPath);
        FileOutputFormat.setOutputPath(job, outPath);
        fs.delete(outPath, true);

        job.setMapperClass(CompetencesDistanceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(CompetencesDistanceReducer.class);
        // job.setOutputFormatClass(TableOutputFormat.class);
        // job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "jobpostcompetence");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        String[] fileNames = args[3].split(",");
        for (String n : fileNames) {
            MultipleOutputs.addNamedOutput(job, n, TextOutputFormat.class, Text.class, Text.class);
        }
        return (job.waitForCompletion(true) ? 0 : 1);
    } catch (IOException | IllegalStateException | IllegalArgumentException | InterruptedException
            | ClassNotFoundException ex) {
        Logger.getLogger(CompetencesDistanceDriver.class.getName()).log(Level.SEVERE, null, ex);
    }
    return 1; // an exception was caught, so report failure rather than success
}
From source file: eu.edisonproject.classification.tfidf.mapreduce.TermWordFrequency.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    Job job = Job.getInstance(jobconf);
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    Path dictionary = new Path(args[0]);
    Path dictionaryHdfs = dictionary;
    Path localDocs = new Path(args[2]);
    Path hdfsDocs = localDocs;
    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        dictionaryHdfs = new Path(dictionary.getName());
        if (!fs.exists(dictionaryHdfs)) {
            fs.copyFromLocalFile(dictionary, dictionaryHdfs);
        }
        hdfsDocs = new Path(localDocs.getName());
        fs.mkdirs(hdfsDocs);
        fs.deleteOnExit(hdfsDocs);
        File[] stats = new File(localDocs.toString()).listFiles();
        for (File stat : stats) {
            Path filePath = new Path(stat.getAbsolutePath());
            if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
                Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
                fs.copyFromLocalFile(filePath, dest);
            }
        }
        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();
    job.addCacheFile(stopwordsHDFS.toUri());
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, dictionaryHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dictionaryHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}