Example usage for org.apache.hadoop.mapred JobConf setOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#setOutputKeyClass.

Prototype

public void setOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the job output data.
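
Before the full examples below, a minimal, self-contained sketch shows where setOutputKeyClass fits in a job configuration. MinimalJob, MyMapper, and MyReducer are hypothetical placeholder classes.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class MinimalJob {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MinimalJob.class);
        conf.setJobName("MinimalJob");

        // Declare the types of the job's final (reducer) output records.
        // Unless overridden with setMapOutputKeyClass/setMapOutputValueClass,
        // these classes are also used for the intermediate map output.
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(MyMapper.class); // hypothetical mapper
        conf.setReducerClass(MyReducer.class); // hypothetical reducer

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}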

Usage

From source file:edu.umd.cloud9.webgraph.BuildWebGraph.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("ConstructWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("BuildWebGraph");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}
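
In this job the map output classes match the final output classes, so the two setMapOutput* calls are redundant: when they are left unset, JobConf falls back to the classes passed to setOutputKeyClass and setOutputValueClass. A sketch of the equivalent, shorter configuration:

    // Map output classes default to the job output classes when unset,
    // so these two calls configure the same types as the four calls above.
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);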

From source file:edu.umd.cloud9.webgraph.ClueExtractLinks.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), ClueExtractLinks.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    conf.setJobName("ClueExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ClueExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }
    return 0;
}

From source file:edu.umd.cloud9.webgraph.CollectHostnames.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), CollectHostnames.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("CollectHostnames");
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapOutputKeyClass(PairOfIntString.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    sLogger.info("PropagateHostname");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        sLogger.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}
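
This example shows the case where the explicit map-output setters are actually required: the mappers emit (PairOfIntString, IntWritable) pairs while the reducers emit (IntWritable, ArrayListWritable), so the intermediate types can no longer default to the job output types. The relevant calls, annotated:

    conf.setOutputKeyClass(IntWritable.class); // final (reducer) output key
    conf.setOutputValueClass(ArrayListWritable.class); // final (reducer) output value
    conf.setMapOutputKeyClass(PairOfIntString.class); // intermediate (map) output key
    conf.setMapOutputValueClass(IntWritable.class); // intermediate (map) output value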

From source file:edu.umd.cloud9.webgraph.ComputeWeight.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), ComputeWeight.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("ComputeWeights");
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ComputeWeight");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.BuildAnchorTextForwardIndex.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];

    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setJobName("BuildAnchorTextForwardIndex");

    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(IndexableAnchorTextForwardIndex.class.getName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;

        if (cnt % 1000 == 0) {
            LOG.info(cnt + " blocks written");
        }

    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    return 0;
}
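
The index file written above is a simple binary layout: two UTF strings (the index class name and the collection path), the block count, and then one (docno, offset, fileno) triple per block. A hedged sketch of reading it back with the matching FSDataInputStream calls:

    // Assumes fs and indexFile as in the example above.
    FSDataInputStream in = fs.open(new Path(indexFile));
    String indexClassName = in.readUTF(); // e.g. IndexableAnchorTextForwardIndex
    String collection = in.readUTF();     // the collection path
    int blockCount = in.readInt();
    for (int i = 0; i < blockCount; i++) {
        int docno = in.readInt();
        int offset = in.readInt();
        short fileno = in.readShort();
        // ... locate the document via (docno, offset, fileno)
    }
    in.close();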

From source file:edu.umd.cloud9.webgraph.driver.BuildIndexableAnchorCollection.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = DriverUtil.argValue(args, DriverUtil.CL_INPUT);
    String outputPath = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT);
    String docnoMappingClass = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING_CLASS);
    String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING);
    int numReducers = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_NUMBER_OF_REDUCERS));
    if (DriverUtil.argExists(args, DriverUtil.CL_MAX_LENGTH)) {
        conf.setInt("Cloud9.maxContentLength",
                Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_MAX_LENGTH)));
    }
    conf.set("Cloud9.DocnoMappingClass", docnoMappingClass);

    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno-mapping class: " + docnoMappingClass);
    LOG.info(" - docno-mapping file: " + docnoMapping);
    if (args.length == 6) {
        LOG.info(" - maximum content length: " + conf.getInt("Cloud9.maxContentLength", 0));
    }

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setJobName("BuildIndexableAnchorCollection");
    conf.setJarByClass(BuildIndexableAnchorCollection.class);

    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(numReducers);
    DistributedCache.addCacheFile(new URI(docnoMapping), conf);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, new Path(collectionPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IndexableAnchorText.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.GenerateTabDelimitedWebGraph.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), GenerateTabDelimitedWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = DriverUtil.argValue(args, "-webgraph") + "/" + DriverUtil.OUTPUT_WEBGRAPH;
    String outPath = DriverUtil.argValue(args, "-output");

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);

    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    conf.setJobName("TabDelimWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);

    JobClient.runJob(conf);
    return 0;
}
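
Here setOutputKeyClass pairs with TextOutputFormat, which writes each record as key.toString(), a tab, and value.toString(); that is what makes the resulting web graph tab-delimited. With an (IntWritable, Text) record, a line of output would look roughly like this:

    // hypothetical output line: "42\t1 17 96"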

From source file:edu.umd.cloud9.webgraph.driver.SortWebGraph.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), SortWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    String inputPath = args[0];
    String outputPath = args[1];
    int numberOfDocuments = Integer.parseInt(args[2]);
    int numMappers = 1;
    int numReducers = Integer.parseInt(args[3]);

    conf.setJobName("SortWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    if (numberOfDocuments == 0) {
        numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS;
    }
    conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments);
    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);
    conf.setMapperClass(IdentityMapper.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("SortAnchorText");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of documents: " + conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS));
    fs.delete(new Path(outputPath), true);
    JobClient.runJob(conf);
    return 0;
}

From source file:edu.umd.cloud9.webgraph.ExtractLinks.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), ExtractLinks.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    if (!fs.exists(new Path(mappingFile)))
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    conf.setJobName("ExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);

    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file:edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java

License:Open Source License

/**
 * Computes the closest pair of points in the given file by issuing
 * two chained MapReduce jobs.
 * @param file the input file of shapes
 * @param params operation parameters
 * @throws IOException
 */
public static <S extends Shape> void cloesetPair(Path file, OperationsParams params) throws IOException {
    // Try to get file MBR from the MBRs of blocks
    JobConf job = new JobConf(params, ClosestPairHadoop.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(file.getName() + ".closest_pair_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
    outFs.delete(outputPath, true);

    job.setJobName("ClosestPair");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map0.class);
    job.setReducerClass(Reduce0.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    //      job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.setInputPaths(job, file);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);
    //////////////////////////////////////////////////////////////////////////

    System.out.println("Begin second round!");
    // 2nd Round
    job = new JobConf(params, ClosestPairHadoop.class);
    job.setJobName("Second Round");
    job.setOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map1.class);
    job.setReducerClass(Reduce1.class);
    clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    //      job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.setInputPaths(job, outputPath); // The previous output is the current input

    Path newPath = new Path(outputPath.getName() + "_result");
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, newPath);

    JobClient.runJob(job);
}
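
In the second-round job, setOutputKeyClass(NullWritable.class) combines with TextOutputFormat so that only the values appear in the output: TextOutputFormat skips both the key and the tab separator when the key is a NullWritable (or null). A hedged sketch of how a reducer would emit such records; the actual body of Reduce1 is not shown in the source:

    // resultText is a hypothetical Text value describing the closest pair.
    output.collect(NullWritable.get(), resultText);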