List of usage examples for org.apache.hadoop.mapred.JobConf.setOutputKeyClass

public void setOutputKeyClass(Class<?> theClass)
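setOutputKeyClass declares the key type of the job's final output; unless setMapOutputKeyClass is called, the same type is also assumed for the map output keys. Before the examples, here is a minimal self-contained sketch of the call in context, an identity pass-through job (the class name IdentityJobSketch and the argument-supplied paths are placeholders, not taken from any example below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class IdentityJobSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(IdentityJobSketch.class);
        conf.setJobName("identity-sketch");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        // TextInputFormat produces (LongWritable offset, Text line) records, and
        // the identity mapper/reducer pass them through unchanged, so the
        // declared output key/value classes must match those types.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}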
From source file: azkaban.jobtype.examples.java.WordCount.java
License: Apache License

public void run() throws Exception {
    logger.info(String.format("Starting %s", getClass().getSimpleName()));

    // hadoop conf should be on the classpath
    JobConf jobconf = getJobConf();
    jobconf.setJarByClass(WordCount.class);

    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(IntWritable.class);

    jobconf.setMapperClass(Map.class);
    jobconf.setReducerClass(Reduce.class);

    jobconf.setInputFormat(TextInputFormat.class);
    jobconf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.addInputPath(jobconf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobconf, new Path(outputPath));

    if (forceOutputOverrite) {
        FileSystem fs = FileOutputFormat.getOutputPath(jobconf).getFileSystem(jobconf);
        fs.delete(FileOutputFormat.getOutputPath(jobconf), true);
    }

    super.run();
}
From source file: babel.prep.corpus.CorpusGenerator.java
License: Apache License

/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, boolean xmlOut) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create " + (xmlOut ? "xml formatted" : "") + " dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CorpusGenMapper.class);
    job.setOutputFormat(xmlOut ? MultipleXMLLangFileOutputFormat.class : MultipleLangFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR),
            "corpus." + (xmlOut ? PARAM_XML + "." : "") + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: babel.prep.datedcorpus.DatedCorpusGenerator.java
License: Apache License

/**
 * Configures a dated dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create dated dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(DatedCorpusGenMapper.class);
    job.setReducerClass(DatedCorpusGenReducer.class);
    job.setMapOutputValueClass(PageVersion.class);
    job.setOutputFormat(DatedLangFilesOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
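Note the split between intermediate and final types in this example: setMapOutputValueClass(PageVersion.class) overrides the intermediate value type, while setOutputKeyClass and setOutputValueClass declare the (Text, Text) pairs the reducer finally emits. In the old mapred API, the output key/value classes double as the defaults for the map output types, so only the types that differ need a separate declaration.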
From source file: babel.prep.extract.NutchPageExtractor.java
License: Apache License

/**
 * Configures the extraction job.
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
    List<Path> segPaths = allSegmentDirs(segmentsPath);

    StringBuilder allSegNames = new StringBuilder();
    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that the mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: babel.prep.langid.LangIdentifier.java
License: Apache License

/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangIdMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langid." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file: babel.prep.langidtime.LangAndTimeExtractor.java
License: Apache License

/**
 * Configures a language id and time extraction job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    // ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    // END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file: babel.prep.merge.PageMerger.java
License: Apache License

/**
 * Configures a reduce-only page merge job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDirOne, String pagesSubDirTwo) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("merge pages in " + pagesSubDirOne + " and " + pagesSubDirTwo);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(PageMergeReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirOne));
    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirTwo));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.merge." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: boa.datagen.SeqSort.java
License: Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 * job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);

    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file: br.eti.kinoshita.hadoop.WordCount.java
License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJarByClass(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path("hdfs://chuva:9000/test/leiseca."));
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
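A note on the combiner in this example: reusing Reduce as the combiner is safe only because word-count aggregation is associative and commutative, and because the reducer consumes and emits the same (Text, IntWritable) pairs declared via setOutputKeyClass/setOutputValueClass. A combiner whose output types differed from the map output types would break the job at runtime.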
From source file: br.ufrj.nce.recureco.distributedindex.indexer.IndexerMain.java
License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(IndexerMain.class);
    conf.setJobName("indexer");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IndexerMap.class);
    conf.setCombinerClass(IndexerReduce.class);
    conf.setReducerClass(IndexerReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}