Example usage for org.apache.hadoop.mapred JobConf setJobName

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf setJobName.

Prototype

public void setJobName(String name)

Source Link

Document

Set the user-specified job name.

Usage

From source file:edu.umd.cloud9.webgraph.ClueExtractLinks.java

License:Apache License

/**
 * Configures the ClueExtractLinks MapReduce job from the "Cloud9.*" configuration
 * values and runs it, unless the output path already exists.
 *
 * @return always 0
 * @throws Exception if the docno mapping file does not exist (RuntimeException) or a
 *         file-system or job call fails
 */
public int runTool() throws Exception {
    final JobConf job = new JobConf(getConf(), ClueExtractLinks.class);
    final FileSystem fileSystem = FileSystem.get(job);

    // Parameters supplied through the configuration.
    final int mapTasks = job.getInt("Cloud9.Mappers", 1);
    final int reduceTasks = job.getInt("Cloud9.Reducers", 200);
    final String input = job.get("Cloud9.InputPath");
    final String output = job.get("Cloud9.OutputPath");
    final String docnoMapping = job.get("Cloud9.DocnoMappingFile");

    // The mapping file is handed to the tasks via the distributed cache, so it must exist.
    final boolean mappingPresent = fileSystem.exists(new Path(docnoMapping));
    if (!mappingPresent) {
        throw new RuntimeException("Error: Docno mapping data file " + docnoMapping + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(docnoMapping), job);

    job.setJobName("ClueExtractLinks");

    // Heap, container memory and task timeout, set under both the mapred.* and
    // mapreduce.* property names.
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "2048");
    job.set("mapreduce.map.java.opts", "-Xmx2048m");
    job.set("mapreduce.reduce.memory.mb", "2048");
    job.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    job.set("mapreduce.task.timeout", "60000000");

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    // TODO: to read!!
    // The reducer class doubles as the combiner.
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    // SequenceFile input; block-compressed SequenceFile output.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(job, input);
    final Path outputDir = new Path(output);
    FileOutputFormat.setOutputPath(job, outputDir);

    LOG.info("ClueExtractLinks");
    LOG.info(" - input path: " + input);
    LOG.info(" - output path: " + output);
    LOG.info(" - mapping file: " + docnoMapping);
    LOG.info(" - include internal links? " + job.getBoolean("Cloud9.IncludeInternalLinks", false));

    // Existing output is treated as "already done": the job is not run again.
    if (fileSystem.exists(outputDir)) {
        LOG.info(output + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(job);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.CollectHostnames.java

License:Apache License

/**
 * Configures the CollectHostnames MapReduce job from the "Cloud9.*" configuration
 * values and runs it, unless the output path already exists.
 *
 * @return always 0
 * @throws Exception if a file-system or job call fails
 */
public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), CollectHostnames.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("CollectHostnames");
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(Reduce.class);

    // Final output types.
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    // Intermediate (map output) types differ from the final output types.
    conf.setMapOutputKeyClass(PairOfIntString.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    // Log header matches the tool/job name (it previously read "PropagateHostname",
    // which misidentified this step in the logs).
    sLogger.info("CollectHostnames");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);

    // Existing output is treated as "already done": the job is not run again.
    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        sLogger.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file:edu.umd.cloud9.webgraph.ComputeWeight.java

License:Apache License

/**
 * Sets up the ComputeWeight MapReduce job from the "Cloud9.*" configuration values and
 * runs it, skipping execution when the output path is already present.
 *
 * @return always 0
 * @throws Exception if a file-system or job call fails
 */
public int runTool() throws Exception {

    final JobConf job = new JobConf(getConf(), ComputeWeight.class);
    final FileSystem fileSystem = FileSystem.get(job);

    // Parameters supplied through the configuration.
    final int mapTasks = job.getInt("Cloud9.Mappers", 1);
    final int reduceTasks = job.getInt("Cloud9.Reducers", 200);
    final String input = job.get("Cloud9.InputPath");
    final String output = job.get("Cloud9.OutputPath");

    // General job settings.
    job.setJobName("ComputeWeights");
    job.set("mapred.child.java.opts", "-Xmx4096m");
    job.setInt("mapred.task.timeout", 60000000);
    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    // Processing classes.
    job.setMapperClass(Map.class);
    job.setPartitionerClass(Partition.class);
    job.setReducerClass(Reduce.class);

    // Final output types, followed by the intermediate (map output) types.
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(ArrayListWritable.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(ArrayListWritable.class);

    // SequenceFile input; block-compressed SequenceFile output.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(job, input);
    final Path outputDir = new Path(output);
    FileOutputFormat.setOutputPath(job, outputDir);

    LOG.info("ComputeWeight");
    LOG.info(" - input path: " + input);
    LOG.info(" - output path: " + output);

    // Existing output is treated as "already done": the job is not run again.
    if (fileSystem.exists(outputDir)) {
        LOG.info(output + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(job);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.BuildAnchorTextForwardIndex.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Launches a MapReduce job over the collection with a single reducer, then converts
 * the reducer output ({@code part-00000}) into a binary index file. The index file starts
 * with the name of {@code IndexableAnchorTextForwardIndex}, the collection path and the
 * block count, followed by one (docno, offset, fileno) record per input line.
 *
 * @param args exactly three values: collection path, output path, index file
 * @return -1 (after printing usage) if the argument count is wrong, 0 otherwise
 * @throws Exception if a file-system or job call fails, a line cannot be parsed, or the
 *         number of records written differs from the {@code Blocks.Total} counter
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];

    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setJobName("BuildAnchorTextForwardIndex");

    conf.setNumMapTasks(100);
    // A single reducer, so that all output ends up in part-00000 (read back below).
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    int cnt = 0;

    // Both streams are closed in finally blocks so that a malformed line or a write
    // failure does not leak the open file handles.
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    try {
        FSDataOutputStream out = fs.create(new Path(indexFile), true);
        try {
            // Index header.
            out.writeUTF(IndexableAnchorTextForwardIndex.class.getName());
            out.writeUTF(collectionPath);
            out.writeInt(blocks);

            Text line = new Text();
            while (reader.readLine(line) > 0) {
                // Each line holds whitespace-separated: docno, offset, fileno.
                String[] arr = line.toString().split("\\s+");

                int docno = Integer.parseInt(arr[0]);
                int offset = Integer.parseInt(arr[1]);
                short fileno = Short.parseShort(arr[2]);

                out.writeInt(docno);
                out.writeInt(offset);
                out.writeShort(fileno);

                cnt++;

                if (cnt % 1000 == 0) {
                    LOG.info(cnt + " blocks written");
                }
            }
        } finally {
            out.close();
        }
    } finally {
        reader.close();
    }

    // The number of records written must agree with what the job counted.
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.BuildIndexableAnchorCollection.java

License:Apache License

/**
 * Runs this tool./* w ww . j  a  va 2  s . c o m*/
 */
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = DriverUtil.argValue(args, DriverUtil.CL_INPUT);
    String outputPath = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT);
    String docnoMappingClass = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING_CLASS);
    String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING);
    int numReducers = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_NUMBER_OF_REDUCERS));
    if (DriverUtil.argExists(args, DriverUtil.CL_MAX_LENGTH)) {
        conf.setInt("Cloud9.maxContentLength",
                Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_MAX_LENGTH)));
    }
    conf.set("Cloud9.DocnoMappingClass", docnoMappingClass);

    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno-mapping class: " + docnoMappingClass);
    LOG.info(" - docno-mapping file: " + docnoMapping);
    if (args.length == 6) {
        LOG.info(" - maximum content length: " + conf.getInt("Cloud9.maxContentLength", 0));
    }

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setJobName("BuildIndexableAnchorCollection");
    conf.setJarByClass(BuildIndexableAnchorCollection.class);

    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(numReducers);
    DistributedCache.addCacheFile(new URI(docnoMapping), conf);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, new Path(collectionPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IndexableAnchorText.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.ClueWebAnchorTextForwardIndexHttpServer.java

License:Apache License

/**
 * Submits a map-only job that runs {@code ServerMapper}, then waits for the job to
 * publish its host name through a temporary rendezvous file and logs it.
 *
 * @param args generic Hadoop options followed by: index file, docno mapping data file,
 *        ClueWeb forward index root
 * @throws Exception if a file-system or job call fails or the wait is interrupted
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 3) {
        System.out.println("usage: [index-file] [docno-mapping-data-files] [clue-forward-index-root]");
        //[clue-forward-index-root: /shared/ClueWeb09/collection.compressed.block/
        System.exit(-1);
    }

    String indexFile = otherArgs[0];
    String mappingFile = otherArgs[1];
    // Normalise the root so that it always ends with a slash.
    String clueIndexRoot = otherArgs[2].endsWith("/") ? otherArgs[2] : otherArgs[2] + "/";

    // Build the list of the ten forward index files: findex.en.01.dat ... findex.en.10.dat.
    String cluewebForwardIndex = "";
    for (int i = 1; i < 10; i++) {
        cluewebForwardIndex += clueIndexRoot + "findex.en.0" + i + ".dat" + SEPARATOR + " ";
    }
    cluewebForwardIndex += clueIndexRoot + "findex.en.10.dat";

    LOG.info("Launching DocumentForwardIndexHttpServer");
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - ClueWeb09 index root:" + clueIndexRoot);

    FileSystem fs = FileSystem.get(conf);

    Random rand = new Random();
    int r = rand.nextInt();

    // this tmp file serves as a rendezvous point
    Path tmpPath = new Path("/tmp/" + r);

    if (fs.exists(tmpPath)) {
        fs.delete(tmpPath, true);
    }

    JobConf job = new JobConf(conf, ClueWebAnchorTextForwardIndexHttpServer.class);

    job.setJobName("ForwardIndexServer:" + indexFile);

    job.set("mapred.child.java.opts", "-Xmx2048m");

    // One mapper, no reducers.
    job.setNumMapTasks(1);
    job.setNumReduceTasks(0);

    job.setInputFormat(NullInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapperClass(ServerMapper.class);

    job.set("IndexFile", indexFile);
    job.set("DocnoMappingDataFile", mappingFile);
    job.set("TmpPath", tmpPath.toString());
    job.set("ClueWebIndexFiles", cluewebForwardIndex);

    // submitJob returns immediately; the wait below is on the rendezvous file.
    JobClient client = new JobClient(job);
    client.submitJob(job);

    LOG.info("Waiting for server to start up...");

    // Poll every 50 seconds until the rendezvous file appears.
    while (!fs.exists(tmpPath)) {
        Thread.sleep(50000);
        LOG.info("...");
    }

    // Close the stream even if reading the host name fails.
    String host;
    FSDataInputStream in = fs.open(tmpPath);
    try {
        host = in.readUTF();
    } finally {
        in.close();
    }

    LOG.info("host: " + host);
    LOG.info("port: 8888");
}

From source file:edu.umd.cloud9.webgraph.driver.GenerateTabDelimitedWebGraph.java

License:Apache License

/**
 * Runs a map-only job that reads the web graph SequenceFiles found under the
 * {@code -webgraph} directory and writes them as text to the {@code -output} path.
 * An existing output path is deleted first.
 *
 * @param args command-line arguments; at least four entries are required, including
 *        the {@code -webgraph} and {@code -output} options
 * @return -1 (after printing usage) if fewer than four arguments are given, 0 otherwise
 * @throws Exception if a file-system or job call fails
 */
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), GenerateTabDelimitedWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = DriverUtil.argValue(args, "-webgraph") + "/" + DriverUtil.OUTPUT_WEBGRAPH;
    String outPath = DriverUtil.argValue(args, "-output");

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);

    // Use the explicit recursive delete rather than the deprecated single-argument
    // overload, matching the other drivers.
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    conf.setJobName("TabDelimWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    // One mapper, no reducers.
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);

    JobClient.runJob(conf);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.SortWebGraph.java

License:Apache License

/**
 * Runs an identity-map / identity-reduce job with a custom partitioner over the input
 * SequenceFiles and writes block-compressed SequenceFiles. An existing output path is
 * deleted before the job starts.
 *
 * @param args exactly four values: input path, output path, number of documents
 *        (0 selects {@code DEFAULT_NUMBER_OF_DOCUMENTS}), number of reducers
 * @return -1 (after printing usage) if the argument count is wrong, 0 otherwise
 * @throws Exception if a number cannot be parsed or a file-system or job call fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), SortWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    String inputPath = args[0];
    String outputPath = args[1];
    int numberOfDocuments = Integer.parseInt(args[2]);
    int numMappers = 1;
    int numReducers = Integer.parseInt(args[3]);

    conf.setJobName("SortWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    // A value of 0 means "use the default".
    if (numberOfDocuments == 0) {
        numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS;
    }
    conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments);
    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);
    conf.setMapperClass(IdentityMapper.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    // Log header matches the tool/job name (it previously read "SortAnchorText").
    LOG.info("SortWebGraph");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of documents: " + conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS));

    // Use the explicit recursive delete rather than the deprecated single-argument
    // overload, matching the other drivers.
    fs.delete(new Path(outputPath), true);
    JobClient.runJob(conf);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.ExtractLinks.java

License:Apache License

/**
 * Builds the ExtractLinks MapReduce job from the "Cloud9.*" configuration values and
 * executes it, unless the output path is already there.
 *
 * @return always 0
 * @throws Exception if the docno mapping file does not exist (RuntimeException) or a
 *         file-system or job call fails
 */
public int runTool() throws Exception {

    final JobConf jobConf = new JobConf(getConf(), ExtractLinks.class);
    final FileSystem hdfs = FileSystem.get(jobConf);

    // Parameters supplied through the configuration.
    final int mapperCount = jobConf.getInt("Cloud9.Mappers", 1);
    final int reducerCount = jobConf.getInt("Cloud9.Reducers", 200);
    final String source = jobConf.get("Cloud9.InputPath");
    final String target = jobConf.get("Cloud9.OutputPath");
    final String mapping = jobConf.get("Cloud9.DocnoMappingFile");

    // The mapping file goes into the distributed cache, so it has to exist.
    final boolean mappingMissing = !hdfs.exists(new Path(mapping));
    if (mappingMissing) {
        throw new RuntimeException("Error: Docno mapping data file " + mapping + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(mapping), jobConf);

    // General job settings.
    jobConf.setJobName("ExtractLinks");
    jobConf.set("mapred.child.java.opts", "-Xmx2048m");
    jobConf.setInt("mapred.task.timeout", 60000000);
    jobConf.setNumMapTasks(mapperCount);
    jobConf.setNumReduceTasks(reducerCount);

    // The reducer class doubles as the combiner.
    jobConf.setMapperClass(Map.class);
    jobConf.setCombinerClass(Reduce.class);
    jobConf.setReducerClass(Reduce.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(ArrayListWritable.class);

    // SequenceFile input; block-compressed SequenceFile output.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(jobConf, source);
    final Path targetDir = new Path(target);
    FileOutputFormat.setOutputPath(jobConf, targetDir);

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + source);
    LOG.info(" - output path: " + target);
    LOG.info(" - mapping file: " + mapping);
    LOG.info(" - include internal links? " + jobConf.getBoolean("Cloud9.IncludeInternalLinks", false));

    // Existing output is treated as "already done": the job is not run again.
    if (hdfs.exists(targetDir)) {
        LOG.info(target + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(jobConf);
    return 0;
}

From source file:edu.umn.cs.spatialHadoop.nasa.DistributedAggregateSpatioTemporalIndexer.java

License:Open Source License

/**
 * Build a bunch of AggregateQuadTrees using a Map-Reduce job
 * /*w  w w.  j  a v  a  2s.com*/
 * @param inputPathsDictionaryPath
 * @param params
 * @throws IOException
 */
public static void aggregateQuadTreeMapReduce(Path inputPathsDictionaryPath, OperationsParams params)
        throws IOException {

    // configure a map-reduce job
    JobConf job = new JobConf(params, DistributedAggregateSpatioTemporalIndexer.class);

    Path outputPath;
    String outputPathPrefix = "aggQuadTree_";
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(outputPathPrefix + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("AggregateQuadTree");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(AggregateQuadTreeMaper.class);
    job.set(HDFSIndexPath, hdfsIndexPath.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    TextInputFormat.setInputPaths(job, inputPathsDictionaryPath);
    TextOutputFormat.setOutputPath(job, outputPath);

    if (job.getBoolean("local", false)) {
        // Enforce local execution if explicitly set by user or for small
        // files
        job.set("mapred.job.tracker", "local");
        // Use multithreading too
        job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, 16);
    }
    job.setNumReduceTasks(0);

    // Submit the job
    JobClient.runJob(job);

    outFs.delete(outputPath, true);
}