Example usage for org.apache.hadoop.mapred JobConf setJobName

Introduction

This page shows example usages of the setJobName method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setJobName(String name)

Source Link

Document

Set the user-specified job name.

Usage

From source file:edu.umd.cloud9.io.benchmark.HadoopSortRandomPairsOfInts.java

License:Apache License

/**
 * Runs this benchmark: an identity map/reduce job (one map task, one reduce task)
 * that reads the sequence file {@code random-pairs.seq}, writes uncompressed text
 * output to {@code random-pairs.sorted}, and prints the elapsed wall-clock time.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException if a file system operation or the job fails
 */
public static void main(String[] args) throws IOException {
    final Path source = new Path("random-pairs.seq");
    final Path sortedDir = new Path("random-pairs.sorted");

    JobConf job = new JobConf(HadoopSortRandomPairsOfInts.class);
    job.setJobName("SortRandomPairsOfInts");

    job.setNumMapTasks(1);
    job.setNumReduceTasks(1);

    // Input side: a sequence file.
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, source);

    // Output side: plain, uncompressed text.
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(PairOfInts.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, sortedDir);
    FileOutputFormat.setCompressOutput(job, false);

    // Records pass through every stage unchanged.
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(IdentityReducer.class);
    job.setReducerClass(IdentityReducer.class);

    // Remove output left over from an earlier run.
    FileSystem.get(job).delete(sortedDir, true);

    final long begin = System.currentTimeMillis();
    JobClient.runJob(job);
    final double seconds = (System.currentTimeMillis() - begin) / 1000.0;

    System.out.println("Job took " + seconds + " seconds");
}

From source file:edu.umd.cloud9.pagerank.BuildPageRankRecords.java

License:Apache License

/**
 * Runs this tool./*from   w ww.  j a  va2s.c  o  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int n = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildPageRankRecords");
    sLogger.info(" - inputDir: " + inputPath);
    sLogger.info(" - outputDir: " + outputPath);
    sLogger.info(" - numNodes: " + n);

    JobConf conf = new JobConf(BuildPageRankRecords.class);
    conf.setJobName("PackageLinkGraph");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);

    conf.setInt("NodeCnt", n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.pagerank.FindMaxPageRankNodes.java

License:Apache License

/**
 * Runs this tool./*from  w w  w.j  a  v a  2  s  .  c o  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int n = Integer.parseInt(args[2]);

    sLogger.info("Tool name: FindMaxPageRankNodes");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);
    sLogger.info(" - n: " + n);

    JobConf conf = new JobConf(FindMaxPageRankNodes.class);
    conf.setJobName("FindMaxPageRankNodes");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setInt("n", n);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.pagerank.PartitionGraph.java

License:Apache License

/**
 * Runs the graph partitioning job.
 *
 * @param args exactly five values: input directory, output directory, number of
 *        partitions, a range flag (parsed as an integer; any non-zero value selects
 *        {@code RangePartitioner}) and the node count
 * @return {@code 0} once the job has run, {@code -1} if the argument count is wrong
 * @throws IOException if a file system operation or the job fails
 */
public int run(String[] args) throws IOException {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    final String source = args[0];
    final String target = args[1];
    final int partitions = Integer.parseInt(args[2]);
    final boolean rangePartitioned = Integer.parseInt(args[3]) != 0;
    final int totalNodes = Integer.parseInt(args[4]);

    sLogger.info("Tool name: PartitionGraph");
    sLogger.info(" - inputDir: " + source);
    sLogger.info(" - outputDir: " + target);
    sLogger.info(" - numPartitions: " + partitions);
    sLogger.info(" - useRange?: " + rangePartitioned);
    sLogger.info(" - nodeCnt: " + totalNodes);

    JobConf job = new JobConf(PartitionGraph.class);
    job.setJobName("Partition Graph " + partitions);

    // One reduce task per requested partition.
    job.setNumReduceTasks(partitions);
    job.setSpeculativeExecution(false);

    job.setInt("NodeCount", totalNodes);
    job.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    job.set("mapred.child.java.opts", "-Xmx2048m");

    // Sequence files in, sequence files out.
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(source));

    final Path targetDir = new Path(target);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, targetDir);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (rangePartitioned) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    // Clear any previous output before launching.
    FileSystem.get(job).delete(targetDir, true);

    JobClient.runJob(job);

    return 0;
}

From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java

License:Apache License

/**
 * Runs phase 1 of one PageRank iteration.
 *
 * <p>Reads {@code path/iter<i>}, writes {@code path/iter<j>t}, and uses one map task
 * and one reduce task per "part-" file found in the input directory. After the job
 * finishes, one float is read from every file under {@code path/iter<j>t-mass} and
 * the values are folded together with {@code sumLogProbs}.
 *
 * @param path base directory that holds the per-iteration directories
 * @param i iteration number whose directory is used as input
 * @param j iteration number used to name the output and the job
 * @param n node count, stored in the job configuration under "NodeCount"
 * @param useCombiner whether to install {@code CombineClass} as the combiner
 * @param useInmapCombiner whether to use {@code MapWithInMapperCombiningClass}
 *        instead of {@code MapClass} as the mapper
 * @param useRange whether to install {@code RangePartitioner}
 * @return the result of folding {@code sumLogProbs} over the floats read from the
 *         mass files, starting from {@code Float.NEGATIVE_INFINITY}
 * @throws IOException if a file system operation or the job fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);
    FileSystem fs = FileSystem.get(conf);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : fs.listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    sLogger.info("PageRank: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    // Location of the mass side files that are read back after the job.
    conf.set("PageRankMassPath", outm);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Clear leftovers from an earlier run of this iteration.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            // Close the stream even when the read fails, so the handle is not leaked.
            fin.close();
        }
    }

    return mass;
}

From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java

License:Apache License

/**
 * Runs phase 2 of one PageRank iteration: a job with zero reduce tasks that reads
 * {@code path/iter<j>t} and writes {@code path/iter<j>}, with the missing mass and
 * the node count passed through the job configuration.
 *
 * @param path base directory that holds the per-iteration directories
 * @param i not read by this method
 * @param j iteration number used to name the input, the output and the job
 * @param n node count, stored under "NodeCount"
 * @param missing missing PageRank mass, stored under "MissingMass"
 * @throws IOException if a file system operation or the job fails
 */
private void phase2(String path, int i, int j, int n, float missing) throws IOException {
    JobConf job = new JobConf(RunPageRankBasic.class);

    sLogger.info("missing PageRank mass: " + missing);
    sLogger.info("number of nodes: " + n);

    final String source = path + "/iter" + sFormat.format(j) + "t";
    final String target = path + "/iter" + sFormat.format(j);

    sLogger.info("PageRank: iteration " + j + ": Phase2");
    sLogger.info(" - input: " + source);
    sLogger.info(" - output: " + target);

    // One map task per entry in the input directory.
    final Path sourceDir = new Path(source);
    final int mapTasks = FileSystem.get(job).listStatus(sourceDir).length;

    job.setJobName("PageRank:Basic:iteration" + j + ":Phase2");

    job.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    job.setFloat("MissingMass", missing);
    job.setInt("NodeCount", n);

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(0);

    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, sourceDir);

    final Path targetDir = new Path(target);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, targetDir);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);
    job.setCombinerClass(IdentityReducer.class);
    job.setReducerClass(IdentityReducer.class);

    // Clear any previous output for this iteration.
    FileSystem.get(job).delete(targetDir, true);

    JobClient.runJob(job);
}

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

/**
 * Runs phase 1 of one "Schimmy" PageRank iteration.
 *
 * <p>Before launching the job, the first record of each file in the input directory
 * is read to find which partition number the configured partitioner assigns to it.
 * The resulting "partition=file" pairs, tab separated, are passed to the job under
 * "PartitionMapping". After the job finishes, one float is read from every file
 * under {@code path/iter<j>t-mass} and the values are folded together with
 * {@code sumLogProbs}.
 *
 * @param path base directory that holds the per-iteration directories
 * @param i iteration number whose directory is used as input
 * @param j iteration number used to name the output and the job
 * @param n node count, stored in the job configuration under "NodeCount"
 * @param useCombiner whether to install {@code CombineClass} as the combiner
 * @param useInmapCombiner whether to use {@code MapWithInMapperCombiningClass}
 *        instead of {@code MapClass} as the mapper
 * @param useRange whether to use {@code RangePartitioner} (otherwise
 *        {@code HashPartitioner} is used for the partition lookup)
 * @return the result of folding {@code sumLogProbs} over the floats read from the
 *         mass files, starting from {@code Float.NEGATIVE_INFINITY}
 * @throws IOException if a file system operation or the job fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    // NOTE(review): the log messages below say "PageRankSchimmy" but the job class is
    // RunPageRankBasic, which looks like a copy-paste leftover. Left unchanged; confirm.
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // List the input directory once; the listing serves both the partition count
    // and the per-file partition lookup below.
    FileStatus[] status = fs.listStatus(new Path(in));

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : status) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    // Must be set before the range partitioner is configured from conf.
    conf.setInt("NodeCount", n);

    Partitioner p;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // NOTE(review): the count above keeps only names containing "part-", while
        // this loop skips only "_logs"; any other entry would be opened as a
        // sequence file. Confirm which filter is intended.
        if (f.getPath().getName().contains("_logs")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        int np;
        try {
            reader.next(key, value);
            np = p.getPartition(key, value, numPartitions);
        } finally {
            // Close the reader even when the peek fails, so the handle is not leaked.
            reader.close();
        }

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    String partitionMapping = sb.toString().trim();

    sLogger.info(partitionMapping);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", partitionMapping);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Clear leftovers from an earlier run of this iteration.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            // Close the stream even when the read fails, so the handle is not leaked.
            fin.close();
        }
    }

    return mass;
}

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

/**
 * Runs phase 2 of one "Schimmy" PageRank iteration: a job with zero reduce tasks
 * that reads {@code path/iter<j>t} and writes {@code path/iter<j>}, with the missing
 * mass and the node count passed through the job configuration.
 *
 * @param path base directory that holds the per-iteration directories
 * @param i not read by this method
 * @param j iteration number used to name the input, the output and the job
 * @param n node count, stored under "NodeCount"
 * @param missing missing PageRank mass, stored under "MissingMass"
 * @throws IOException if a file system operation or the job fails
 */
private void phase2(String path, int i, int j, int n, float missing) throws IOException {
    JobConf job = new JobConf(RunPageRankBasic.class);

    sLogger.info("missing PageRank mass: " + missing);
    sLogger.info("number of nodes: " + n);

    final String source = path + "/iter" + sFormat.format(j) + "t";
    final String target = path + "/iter" + sFormat.format(j);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase2");
    sLogger.info(" - input: " + source);
    sLogger.info(" - output: " + target);

    // One map task per entry in the input directory.
    final Path sourceDir = new Path(source);
    final int mapTasks = FileSystem.get(job).listStatus(sourceDir).length;

    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase2");
    job.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(0);

    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, sourceDir);

    final Path targetDir = new Path(target);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, targetDir);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);
    job.setCombinerClass(IdentityReducer.class);
    job.setReducerClass(IdentityReducer.class);

    job.setFloat("MissingMass", missing);
    job.setInt("NodeCount", n);

    // Clear any previous output for this iteration.
    FileSystem.get(job).delete(targetDir, true);

    JobClient.runJob(job);
}

From source file:edu.umd.cloud9.webgraph.BuildReverseWebGraph.java

License:Apache License

/**
 * Configures the "ReverseWebGraph" job from the "Cloud9.*" settings of the current
 * configuration and runs it, unless the output path already exists.
 *
 * @return always {@code 0}, whether the job ran or was skipped
 * @throws Exception if a file system operation or the job fails
 */
public int runTool() throws Exception {
    JobConf job = new JobConf(getConf(), BuildReverseWebGraph.class);
    FileSystem fileSystem = FileSystem.get(job);

    final int mapTasks = job.getInt("Cloud9.Mappers", 1);
    final int reduceTasks = job.getInt("Cloud9.Reducers", 200);
    final String source = job.get("Cloud9.InputPath");
    final String target = job.get("Cloud9.OutputPath");

    job.setJobName("ReverseWebGraph");

    // JVM options, memory and timeout, set under both the mapred.* and mapreduce.* names.
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "2048");
    job.set("mapreduce.map.java.opts", "-Xmx2048m");
    job.set("mapreduce.reduce.memory.mb", "2048");
    job.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    job.set("mapreduce.task.timeout", "60000000");

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ArrayListWritable.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(ArrayListWritable.class);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Block-compressed sequence file output.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(job, source);
    final Path targetDir = new Path(target);
    FileOutputFormat.setOutputPath(job, targetDir);

    LOG.info("BuildReverseWebGraph");
    LOG.info(" - input path: " + source);
    LOG.info(" - output path: " + target);

    // Existing output is taken to mean this step was already done.
    if (fileSystem.exists(targetDir)) {
        LOG.info(target + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(job);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.BuildWebGraph.java

License:Apache License

/**
 * Configures the "ConstructWebGraph" job from the "Cloud9.*" settings of the current
 * configuration and runs it, unless the output path already exists.
 *
 * @return always {@code 0}, whether the job ran or was skipped
 * @throws Exception if a file system operation or the job fails
 */
public int runTool() throws Exception {
    JobConf job = new JobConf(getConf(), BuildWebGraph.class);
    FileSystem fileSystem = FileSystem.get(job);

    final int mapTasks = job.getInt("Cloud9.Mappers", 1);
    final int reduceTasks = job.getInt("Cloud9.Reducers", 200);
    final String source = job.get("Cloud9.InputPath");
    final String target = job.get("Cloud9.OutputPath");

    job.setJobName("ConstructWebGraph");

    // JVM options, memory and timeout, set under both the mapred.* and mapreduce.* names.
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "2048");
    job.set("mapreduce.map.java.opts", "-Xmx2048m");
    job.set("mapreduce.reduce.memory.mb", "2048");
    job.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    job.set("mapreduce.task.timeout", "60000000");

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ArrayListWritable.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(ArrayListWritable.class);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Block-compressed sequence file output.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(job, source);
    final Path targetDir = new Path(target);
    FileOutputFormat.setOutputPath(job, targetDir);

    LOG.info("BuildWebGraph");
    LOG.info(" - input path: " + source);
    LOG.info(" - output path: " + target);

    // Existing output is taken to mean this step was already done.
    if (fileSystem.exists(targetDir)) {
        LOG.info(target + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(job);
    return 0;
}