Example usage for org.apache.hadoop.mapred JobConf setOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setOutputKeyClass.

Prototype

public void setOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the job output data.
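
As a quick orientation before the collected examples, here is a minimal word-count style sketch (not taken from any of the projects listed below; the class name SetOutputKeyClassExample and the job name are illustrative) showing where setOutputKeyClass typically fits when configuring a JobConf. The classes passed to setOutputKeyClass/setOutputValueClass must match the key/value types the reducer emits, and, unless setMapOutputKeyClass/setMapOutputValueClass are called separately, they also describe the map output types.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class SetOutputKeyClassExample {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetOutputKeyClassExample.class);
        conf.setJobName("setoutputkeyclass-example");

        // Declare the job output key/value types; these must agree with what
        // the reducer's OutputCollector receives (Text keys, IntWritable values).
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}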

Usage

From source file: azkaban.jobtype.examples.java.WordCount.java

License: Apache License

public void run() throws Exception {
    logger.info(String.format("Starting %s", getClass().getSimpleName()));

    // hadoop conf should be on the classpath
    JobConf jobconf = getJobConf();
    jobconf.setJarByClass(WordCount.class);

    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(IntWritable.class);

    jobconf.setMapperClass(Map.class);
    jobconf.setReducerClass(Reduce.class);

    jobconf.setInputFormat(TextInputFormat.class);
    jobconf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.addInputPath(jobconf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobconf, new Path(outputPath));

    if (forceOutputOverrite) {
        FileSystem fs = FileOutputFormat.getOutputPath(jobconf).getFileSystem(jobconf);
        fs.delete(FileOutputFormat.getOutputPath(jobconf), true);
    }

    super.run();
}

From source file: babel.prep.corpus.CorpusGenerator.java

License: Apache License

/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, boolean xmlOut) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create " + (xmlOut ? "xml formatted" : "") + " dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CorpusGenMapper.class);
    job.setOutputFormat(xmlOut ? MultipleXMLLangFileOutputFormat.class : MultipleLangFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR),
            "corpus." + (xmlOut ? PARAM_XML + "." : "") + getCurTimeStamp());
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}

From source file: babel.prep.datedcorpus.DatedCorpusGenerator.java

License: Apache License

/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create dated dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(DatedCorpusGenMapper.class);
    job.setReducerClass(DatedCorpusGenReducer.class);

    job.setMapOutputValueClass(PageVersion.class);
    job.setOutputFormat(DatedLangFilesOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}

From source file: babel.prep.extract.NutchPageExtractor.java

License: Apache License

/**
 * Configures the extraction job.
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);

    List<Path> segPaths = allSegmentDirs(segmentsPath);
    StringBuilder allSegNames = new StringBuilder();

    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}

From source file: babel.prep.langid.LangIdentifier.java

License: Apache License

/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangIdMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langid." + getCurTimeStamp());
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}

From source file: babel.prep.langidtime.LangAndTimeExtractor.java

License: Apache License

/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    //ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    //END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}

From source file: babel.prep.merge.PageMerger.java

License: Apache License

/**
 * Configures a reduce-only page merge job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDirOne, String pagesSubDirTwo)
        throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("merge pages in " + pagesSubDirOne + " and " + pagesSubDirTwo);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(PageMergeReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirOne));
    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirTwo));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.merge." + getCurTimeStamp());
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}

From source file: boa.datagen.SeqSort.java

License: Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);

    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    // Set the input and output paths.
    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file: br.eti.kinoshita.hadoop.WordCount.java

License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJarByClass(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path("hdfs://chuva:9000/test/leiseca."));
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}

From source file: br.ufrj.nce.recureco.distributedindex.indexer.IndexerMain.java

License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(IndexerMain.class);
    conf.setJobName("indexer");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IndexerMap.class);
    conf.setCombinerClass(IndexerReduce.class);
    conf.setReducerClass(IndexerReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}