Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf method setMapOutputKeyClass from open-source projects.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the map output data.
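
If the map output classes are not set, Hadoop falls back to the job's final output key/value classes, so this call is only needed when the intermediate types differ. Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects) showing where the call fits in a JobConf-driven job; it uses the built-in IdentityMapper and IdentityReducer and assumes the input and output paths are passed as arguments.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetMapOutputKeyClassExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetMapOutputKeyClassExample.class);
        conf.setJobName("setMapOutputKeyClass example");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Types of the intermediate records emitted by the mapper.
        // If these calls are omitted, Hadoop falls back to the final
        // output key/value classes set below.
        conf.setMapOutputKeyClass(LongWritable.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        JobClient.runJob(conf);
    }
}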

Usage

From source file:com.vsii.ttxvn.crawling.DeleteFailedDataJob.java

License:Apache License

public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: DeleteFailedDataJob <crawldb>");
        return 1;
    }

    String crawldb = args[0];

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeleteFailedDataJob: starting at " + sdf.format(start));

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/dedup-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    job.setJobName("DeleteFailedData on " + crawldb);

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);

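    // The intermediate key type (BytesWritable) differs from the final
    // output key type (Text) below, so it must be declared explicitly.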
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    try {
        RunningJob rj = JobClient.runJob(job);
        Group g = rj.getCounters().getGroup("DeleteFailedDataJobStatus");
        if (g != null) {
            long dups = g.getCounter("Documents marked as duplicate");
            LOG.info("DeleteFailedData: " + (int) dups + " documents marked as duplicates");
        }
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataJob: " + StringUtils.stringifyException(e));
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("DeleteFailedData: Updating status of duplicate urls into crawl db.");
    }

    Path dbPath = new Path(crawldb);
    JobConf mergeJob = CrawlDb.createJob(getConf(), dbPath);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);

    try {
        JobClient.runJob(mergeJob);
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataMergeJob: " + StringUtils.stringifyException(e));
        return -1;
    }

    CrawlDb.install(mergeJob, dbPath);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("DeleteFailedData finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));

    return 0;
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Maps from a Wikipedia article to (srcID, (targetID, anchor)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

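    // Intermediate types (PairOfStringInt/PairOfStrings) differ from the
    // final output types below, so they are set explicitly.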
    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfStrings.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 *
 * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    Random r = new Random();
    //String tmpOutput = "tmp-" + this.getClass().getCanonicalName() + "-" + r.nextInt(10000);
    //LOG.info( "intermediate folder for merge " + tmpOutput );

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    //FileOutputFormat.setOutputPath(conf, new Path(tmpOutput));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);

    /*
    //merge
    String finalO = outputPath+"/part-00000/data";
    FileSystem.get(conf).mkdirs( new Path( outputPath + "part-00000") );
    getMergeInHdfs( tmpOutput, finalO, conf );
    FileSystem.get(conf).delete(new Path(tmpOutput), true);
    */
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);
    String location = "map.dat";

    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}

From source file:com.zjy.mongo.util.MongoTool.java

License:Apache License

private int runMapredJob(final Configuration conf) {
    final JobConf job = new JobConf(conf, getClass());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here.
     * They override any XML-level values.
     * Note that the space after -D is important: without it,
     * the option is picked up by the JVM itself rather than Hadoop.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MapredMongoConfigUtil.getReducer(conf));

    job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf));
    job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf);

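    // Override the map output types only when configured; otherwise
    // Hadoop defaults them to the job's output key/value classes.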
    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MapredMongoConfigUtil.isJobBackground(conf);
    try {
        RunningJob runningJob = JobClient.runJob(job);
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            runningJob.waitForCompletion();
            return 0;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }

}

From source file:contrail.stages.GraphToFasta.java

License:Open Source License

@Override
public RunningJob runJob() throws Exception {
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");

    sLogger.info(" - inputpath: " + inputPath);
    sLogger.info(" - outputpath: " + outputPath);

    JobConf conf = new JobConf(GraphToFasta.class);

    AvroJob.setInputSchema(conf, GraphNodeData.SCHEMA$);

    initializeJobConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    AvroInputFormat<GraphNodeData> input_format = new AvroInputFormat<GraphNodeData>();
    conf.setInputFormat(input_format.getClass());
    conf.setOutputFormat(TextOutputFormat.class);

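    // With no reduce tasks (set below), the mapper's output is the job
    // output, so these intermediate types describe the final records.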
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Make it mapper only.
    conf.setNumReduceTasks(0);
    conf.setMapperClass(GraphToFastqMapper.class);

    if (stage_options.containsKey("writeconfig")) {
        writeJobConfig(conf);
    } else {
        // Delete the output directory if it exists already
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            // TODO(jlewi): We should only delete an existing directory
            // if explicitly told to do so.
            sLogger.info("Deleting output path: " + out_path + " because it already exists.");
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) ((endtime - starttime) / 1000.0);

        System.out.println("Runtime: " + diff + " s");
        return result;
    }
    return null;
}

From source file:Corrector.Correction.java

License:Apache License

public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: Correction [0/7]");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(Correction.class);
    conf.setJobName("Correction " + inputPath + " " + Config.K);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

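    // Here the intermediate types match the final output types, so these
    // calls are optional but make the configuration explicit.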
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    //conf.setBoolean("mapred.output.compress", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CorrectionMapper.class);
    conf.setReducerClass(CorrectionReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:Corrector.FindError.java

License:Apache License

public RunningJob run(String inputPath, String outputPath, int idx, String hkmerlist) throws Exception {
    sLogger.info("Tool name: FindError");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(FindError.class);
    conf.setJobName("FindError " + inputPath + " " + Config.K);
    conf.setLong("IDX", idx);
    //\\
    DistributedCache.addCacheFile(new URI(hkmerlist), conf);
    //\\

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(FindErrorMapper.class);
    conf.setReducerClass(FindErrorReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}