Example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:babel.prep.datedcorpus.DatedCorpusGenerator.java

License:Apache License

/**
 * Builds the job configuration used to generate the dated corpus.
 *
 * <p>The job reads sequence files from {@code pagesSubDir} under
 * {@code crawlDir}, is wired to {@code DatedCorpusGenMapper} and
 * {@code DatedCorpusGenReducer}, and writes to a time-stamped
 * "datedcorpus." directory below {@code CORPUS_SUBDIR}. Whatever already
 * exists at that output location is deleted recursively first.
 *
 * @param crawlDir root directory of the crawl
 * @param pagesSubDir sub-directory of {@code crawlDir} holding the input pages
 * @return the configured job
 * @throws IOException propagated from the file system or job set-up calls
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    final JobConf conf = new JobConf(getConf());
    conf.setJobName("create dated dataset from " + pagesSubDir);

    // Processing stages and the intermediate (map output) value type.
    conf.setMapperClass(DatedCorpusGenMapper.class);
    conf.setReducerClass(DatedCorpusGenReducer.class);
    conf.setMapOutputValueClass(PageVersion.class);

    // Input side: sequence files under <crawlDir>/<pagesSubDir>.
    conf.setInputFormat(SequenceFileInputFormat.class);
    final Path inputDir = new Path(crawlDir, pagesSubDir);
    FileInputFormat.addInputPath(conf, inputDir);

    // Output side: Text keys and values through the custom output format.
    conf.setOutputFormat(DatedLangFilesOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // Clear out anything already sitting at the output location.
    final Path corpusRoot = new Path(crawlDir, CORPUS_SUBDIR);
    final Path outputDir = new Path(corpusRoot, "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outputDir, true);
    FileOutputFormat.setOutputPath(conf, outputDir);

    setUniqueTempDir(conf);
    return conf;
}

From source file:babel.prep.extract.NutchPageExtractor.java

License:Apache License

/**
 * Builds the job configuration for the page extraction job.
 *
 * <p>Every segment directory found under {@code SEGMENTS_SUBDIR} of
 * {@code crawlDir} contributes input paths; which of its sub-directories are
 * added is controlled by the {@code m_ge}, {@code m_fe}, {@code m_pa},
 * {@code m_co}, {@code m_pd} and {@code m_pt} flags. Output goes to a
 * time-stamped "pages.extract." directory below {@code PAGES_SUBDIR}, and
 * whatever already exists at that location is deleted recursively first.
 *
 * @param crawlDir root directory of the crawl
 * @return the configured job
 * @throws IOException propagated from the file system or job set-up calls
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    final Path segmentsRoot = new Path(crawlDir, SEGMENTS_SUBDIR);
    final List<Path> segments = allSegmentDirs(segmentsRoot);

    // Space-prefixed segment names, appended to the job name below.
    final StringBuilder segmentNames = new StringBuilder();
    for (Path segment : segments) {
        segmentNames.append(' ').append(segment.getName());
    }

    final String jobTimeStamp = getCurTimeStamp();

    final JobConf conf = new NutchJob(getConf());
    conf.setJobName("read segments" + segmentNames.toString());

    // Record which kinds of segment data are being extracted.
    conf.setBoolean("segment.reader.co", m_co);
    conf.setBoolean("segment.reader.fe", m_fe);
    conf.setBoolean("segment.reader.ge", m_ge);
    conf.setBoolean("segment.reader.pa", m_pa);
    conf.setBoolean("segment.reader.pd", m_pd);
    conf.setBoolean("segment.reader.pt", m_pt);

    // For each segment, register the sub-directories selected by the flags.
    for (Path segment : segments) {
        if (m_ge) {
            FileInputFormat.addInputPath(conf, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
        }
        if (m_fe) {
            FileInputFormat.addInputPath(conf, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
        }
        if (m_pa) {
            FileInputFormat.addInputPath(conf, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
        }
        if (m_co) {
            FileInputFormat.addInputPath(conf, new Path(segment, Content.DIR_NAME));
        }
        if (m_pd) {
            FileInputFormat.addInputPath(conf, new Path(segment, ParseData.DIR_NAME));
        }
        if (m_pt) {
            FileInputFormat.addInputPath(conf, new Path(segment, ParseText.DIR_NAME));
        }
    }

    // Pass the segments directory name and the job start time/date along in
    // the configuration.
    conf.set(JOB_PROP_SEGMENTS_DIR, segmentsRoot.getName());
    conf.set(JOB_PROP_JOB_TIMESTAMP, jobTimeStamp);

    // Processing stages and the intermediate (map output) value type.
    conf.setMapperClass(PageExtMapper.class);
    conf.setReducerClass(PageExtReducer.class);
    conf.setMapOutputValueClass(NutchChunk.class);

    // Sequence files in, sequence files of (Text, Page) out.
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Page.class);

    // Clear out anything already sitting at the output location.
    final Path pagesRoot = new Path(crawlDir, PAGES_SUBDIR);
    final Path outputDir = new Path(pagesRoot, "pages.extract." + jobTimeStamp);
    m_fs.delete(outputDir, true);
    FileOutputFormat.setOutputPath(conf, outputDir);

    setUniqueTempDir(conf);
    return conf;
}

From source file:Brush.AdjustMateEdge.java

License:Apache License

/**
 * Configures and runs the AdjustMateEdge job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @param reads value stored in the job configuration under "READS"
 * @param ctg_sum value stored in the job configuration under "CTG_SUM"
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception {
    sLogger.info("Tool name: AdjustMateEdge");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(AdjustMateEdge.class);
    conf.setJobName("AdjustMateEdge " + inputPath);

    conf.setLong("READS", reads);
    conf.setLong("CTG_SUM", ctg_sum);
    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(AdjustMateEdgeMapper.class);
    conf.setReducerClass(AdjustMateEdgeReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.Compressible.java

License:Apache License

/**
 * Configures and runs the Compressible job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: Compressible");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(Compressible.class);
    conf.setJobName("Compressible " + inputPath);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CompressibleMapper.class);
    conf.setReducerClass(CompressibleReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.CountBraid.java

License:Apache License

/**
 * Configures and runs the CountBraid job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: CountBraid");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(CountBraid.class);
    conf.setJobName("CountBraid " + inputPath + " " + BrushConfig.K);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CountBraidMapper.class);
    conf.setReducerClass(CountBraidReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.CountKmer.java

License:Apache License

/**
 * Configures and runs the CountKmer job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: CountKmer");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(CountKmer.class);
    conf.setJobName("CountKmer " + inputPath + " " + BrushConfig.K);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CountKmerMapper.class);
    conf.setReducerClass(CountKmerReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.CountReads.java

License:Apache License

/**
 * Configures and runs the CountReads job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Only a mapper class is set; no reducer class is
 * configured here. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: CountReads");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(CountReads.class);
    conf.setJobName("CountReads " + inputPath + " " + BrushConfig.K);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // Deliberately no setReducerClass call: this job only names a mapper.
    conf.setMapperClass(CountReadsMapper.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.CutChimericLinks.java

License:Apache License

/**
 * Configures and runs the CutChimericLinks job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: CutChimericLinks");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(CutChimericLinks.class);
    conf.setJobName("CutChimericLinks " + inputPath + " " + BrushConfig.K);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CutChimericLinksMapper.class);
    conf.setReducerClass(CutChimericLinksReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.CutRepeatBoundary.java

License:Apache License

/**
 * Configures and runs the CutRepeatBoundary job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @param reads value stored in the job configuration under "READS"
 * @param ctg_sum value stored in the job configuration under "CTG_SUM"
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception {
    sLogger.info("Tool name: CutRepeatBoundary");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(CutRepeatBoundary.class);
    conf.setJobName("CutRepeatBoundary " + inputPath + " " + BrushConfig.K);
    conf.setLong("READS", reads);
    conf.setLong("CTG_SUM", ctg_sum);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CutRepeatBoundaryMapper.class);
    conf.setReducerClass(CutRepeatBoundaryReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}

From source file:Brush.DefineConsensus.java

License:Apache License

/**
 * Configures and runs the DefineConsensus job.
 *
 * <p>The job reads text input from {@code inputPath}, uses {@code Text} for
 * both map-output and final keys and values, and writes text output to
 * {@code outputPath}. Any existing output directory is removed recursively
 * before the job is run.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path the job writes to; deleted first if it already exists
 * @return the handle returned by {@code JobClient.runJob}
 * @throws Exception propagated from configuration, file system or job execution
 */
public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: DefineConsensus");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(DefineConsensus.class);
    conf.setJobName("DefineConsensus " + inputPath + " " + BrushConfig.K);

    BrushConfig.initializeConfiguration(conf);

    Path outDir = new Path(outputPath);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outDir);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(DefineConsensusMapper.class);
    conf.setReducerClass(DefineConsensusReducer.class);

    // Delete the output directory if it exists already. The file system is
    // resolved from the path itself rather than taken from the default file
    // system, so a fully qualified output URI on another file system works.
    outDir.getFileSystem(conf).delete(outDir, true);

    return JobClient.runJob(conf);
}