Example usage for org.apache.hadoop.mapred.JobConf#setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapred.JobConf#setMapOutputValueClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java

License:Apache License

/**
 * Fills the "mapr-i" named map with {@code expectedSize} entries, runs a mapred job
 * that reads from it and writes to the "mapr-o" named map, and asserts that both maps
 * then hold the same number of entries.
 *
 * <p>The bulk loader is closed and the invocation grid is unloaded in {@code finally}
 * blocks, so neither is left open when loading, the job, or an assertion fails.
 *
 * @param args command-line arguments; not read by this method
 * @return always 1
 * @throws Exception if populating the maps, running the job, or an assertion fails
 */
public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    // Start from empty maps so the size assertions below are meaningful.
    inputMap.clear();
    outputMap.clear();
    // NOTE(review): presumably gives the grid time to settle after the clears - confirm.
    Thread.sleep(15000);

    // Every entry carries the same filler payload; only the key differs.
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;

    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    try {
        for (int i = 0; i < expectedSize; i++) {
            count.set(i);
            put.put(count, contentW);
        }
    } finally {
        // Close the loader even if a put fails part-way through.
        put.close();
    }

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();
    try {
        JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
        configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
        configuration.setMapOutputKeyClass(IntWritable.class);
        configuration.setMapOutputValueClass(Text.class);
        configuration.setOutputKeyClass(IntWritable.class);
        configuration.setOutputValueClass(Text.class);
        configuration.setInputFormat(NamedMapInputFormatMapred.class);
        configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
        NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
        NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);

        // Before the job: the output map is still empty, so input == 0 + expectedSize.
        assertEquals(inputMap.size(), outputMap.size() + expectedSize);
        HServerJobClient.runJob(configuration, false, grid);
        // After the job: both maps must hold the same number of entries.
        assertEquals(inputMap.size(), outputMap.size());

        inputMap.clear();
        outputMap.clear();
    } finally {
        // Release the grid even when the job or an assertion fails.
        grid.unload();
    }
    return 1;
}

From source file:com.spotify.hdfs2cass.BulkLoader.java

License:Apache License

/**
 * Parses the command line, configures the "bulkloader-hdfs-to-cassandra" job and runs it.
 *
 * <p>Options read here: {@code i} input paths, {@code h} seed node host, {@code p} seed
 * node port (default "9160"), {@code k} keyspace, {@code c} column family, {@code m}
 * number of map tasks, {@code r} requested reducers, {@code P} parallel reduce copies,
 * {@code pool} fair-scheduler pool, {@code s} bulk output buffer size (also selects
 * {@code BulkOutputFormat}), {@code M} stream throttle, {@code C} compression class,
 * {@code b} base64 flag and {@code n} job-name suffix.
 *
 * @param args raw command-line arguments handed to {@code parseOptions}
 * @return 0 once {@code JobClient.runJob} has returned
 * @throws Exception if option parsing, cluster lookup or the job itself fails
 */
public int run(String[] args) throws Exception {
    final CommandLine cmdLine = parseOptions(args);

    final String[] inputPaths = cmdLine.getOptionValues('i');
    final String seedNodeHost = cmdLine.getOptionValue('h');
    final String seedNodePort = cmdLine.getOptionValue('p', "9160");
    final String keyspace = cmdLine.getOptionValue('k');
    final String colfamily = cmdLine.getOptionValue('c');
    final int numMappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    final int parallelCopies = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    final String poolName = cmdLine.getOptionValue("pool");

    final ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int requestedReducers = Integer.parseInt(cmdLine.getOptionValue('r', "0"));
    final int numReducers = adjustReducers(requestedReducers, clusterInfo.getNumClusterNodes());

    // Cassandra output settings go on the base configuration before the JobConf is built.
    final Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    final boolean useBulkOutput = cmdLine.hasOption('s');
    if (useBulkOutput) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }
    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }
    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }
    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    final JobConf job = new JobConf(conf);

    // Only override task counts that were given as positive values.
    if (numMappers > 0) {
        job.setNumMapTasks(numMappers);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
    }
    if (parallelCopies > 0) {
        job.set("mapred.reduce.parallel.copies", Integer.toString(parallelCopies));
    }
    if (poolName != null) {
        job.set("mapred.fairscheduler.pool", poolName);
    }

    // Publish the cluster nodes through the job configuration for the other Hadoop nodes.
    clusterInfo.setConf(job);

    final String baseJobName = "bulkloader-hdfs-to-cassandra";
    final String jobName = cmdLine.hasOption('n')
            ? baseJobName + "-" + cmdLine.getOptionValue('n')
            : baseJobName;
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);
    for (final String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    // The map side just emits text; the reduce side sends it to Cassandra.
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(CassandraPartitioner.class);

    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (useBulkOutput) {
        job.setOutputFormat(BulkOutputFormat.class);
    } else {
        job.setOutputFormat(ColumnFamilyOutputFormat.class);
    }

    JobClient.runJob(job);
    return 0;
}

From source file:com.vsii.ttxvn.crawling.DeleteFailedDataJob.java

License:Apache License

/**
 * Runs the DeleteFailedData job over the current crawl db into a temporary directory,
 * then merges that output back into the crawl db and installs the result.
 *
 * <p>The temporary directory is removed on success and, best-effort, on either failure
 * path so that failed runs do not leave "dedup-temp-*" directories behind.
 *
 * @param args command-line arguments; {@code args[0]} is the crawl db path
 * @return 0 on success, 1 when the crawl db argument is missing, -1 when either job fails
 * @throws IOException if setting up the jobs, installing the crawl db or the final
 *         clean-up fails
 */
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: DeleteFailedDataJob <crawldb>");
        return 1;
    }

    String crawldb = args[0];

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeleteFailedDataJob: starting at " + sdf.format(start));

    // Random suffix keeps runs from writing into the same temp directory.
    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/dedup-temp-"
            + new Random().nextInt(Integer.MAX_VALUE));

    JobConf job = new NutchJob(getConf());

    job.setJobName("DeleteFailedData on " + crawldb);

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    try {
        RunningJob rj = JobClient.runJob(job);
        Group g = rj.getCounters().getGroup("DeleteFailedDataJobStatus");
        if (g != null) {
            // Log the counter as a long; narrowing to int would corrupt large counts.
            long dups = g.getCounter("Documents marked as duplicate");
            LOG.info("DeleteFailedData: " + dups + " documents marked as duplicates");
        }
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataJob: " + StringUtils.stringifyException(e));
        deleteTempDirQuietly(tempDir);
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("DeleteFailedData: Updating status of duplicate urls into crawl db.");
    }

    Path dbPath = new Path(crawldb);
    JobConf mergeJob = CrawlDb.createJob(getConf(), dbPath);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);

    try {
        JobClient.runJob(mergeJob);
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataMergeJob: " + StringUtils.stringifyException(e));
        deleteTempDirQuietly(tempDir);
        return -1;
    }

    CrawlDb.install(mergeJob, dbPath);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("DeleteFailedData finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));

    return 0;
}

/** Removes the temporary job output on a failure path; a failed delete is logged, not thrown. */
private void deleteTempDirQuietly(Path tempDir) {
    try {
        FileSystem.get(getConf()).delete(tempDir, true);
    } catch (final IOException e) {
        LOG.warn("DeleteFailedDataJob: could not remove " + tempDir + ": "
                + StringUtils.stringifyException(e));
    }
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath/*from ww  w .  j  a  v  a 2 s . c om*/
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Phase 1: maps from Wikipedia article to (srcID, (targetID, anchor)).
 *
 * <p>Any existing output directory is deleted before the job is run.
 *
 * @param inputPath  path added as the job's sequence-file input
 * @param outputPath path set as the job's sequence-file output; removed first if present
 * @throws IOException propagated from configuring the job, deleting the output or running it
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    final JobConf job = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    final String jobName = String.format(
            "ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath);
    job.setJobName(jobName);

    // Processing classes; 10 reducers is reasonable.
    job.setMapperClass(MyMapper1.class);
    job.setPartitionerClass(MyPartitioner1.class);
    job.setReducerClass(MyReducer1.class);
    job.setNumReduceTasks(10);

    // Where the data comes from and goes to.
    final Path outputDir = new Path(outputPath);
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputDir);

    // Map emits (PairOfStringInt, PairOfStrings); reduce emits (Text, PairOfStrings).
    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(PairOfStrings.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStrings.class);

    // Clear out any previous output so the job can write to the directory.
    FileSystem.get(job).delete(outputDir, true);

    JobClient.runJob(job);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Phase 2: maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * <p>The file {@code redirPath + "/part-00000"} is added to the distributed cache under
 * the symlink name "redirs.dat". Any existing output directory is deleted before the job
 * runs, and the input directory is deleted once the job has completed.
 *
 * @param inputPath  path added as the job's sequence-file input; deleted after the job
 * @param outputPath path set as the job's map-file output; removed first if present
 * @param redirPath  directory whose "part-00000" file is shipped via the distributed cache
 * @throws IOException if the cache URI cannot be built from {@code redirPath}, or
 *         propagated from configuring the job, the deletes or the job run
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        // Fail fast: running without the cache file would only fail later, or give wrong output.
        throw new IOException("Invalid distributed-cache URI built from redirPath: " + redirPath, e);
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Extracts CF for each found anchor./*from w ww  .j  av a 2 s  .c  om*/
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);
    String location = "map.dat";

    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}

From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License:Apache License

/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath/*from w  w  w . j a v a2s  .  c om*/
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}

From source file:com.zjy.mongo.util.MongoTool.java

License:Apache License

private int runMapredJob(final Configuration conf) {
    final JobConf job = new JobConf(conf, getClass());
    /**//ww  w.  j av a2  s. c  om
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MapredMongoConfigUtil.getReducer(conf));

    job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf));
    job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MapredMongoConfigUtil.isJobBackground(conf);
    try {
        RunningJob runningJob = JobClient.runJob(job);
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            runningJob.waitForCompletion();
            return 0;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }

}

From source file:contrail.stages.GraphToFasta.java

License:Open Source License

/**
 * Configures the mapper-only job that reads {@code GraphNodeData} Avro records and writes
 * text output, then either writes the job configuration or runs the job.
 *
 * <p>When the "writeconfig" stage option is present, only {@code writeJobConfig} is called.
 * Otherwise an existing output directory is deleted, the job is run, and the elapsed
 * wall-clock time is printed to standard output.
 *
 * @return the job handle from {@code JobClient.runJob}, or {@code null} when only the
 *         configuration was written
 * @throws Exception propagated from job configuration, file-system access or the job run
 */
@Override
public RunningJob runJob() throws Exception {
    final String inputPath = (String) stage_options.get("inputpath");
    final String outputPath = (String) stage_options.get("outputpath");

    sLogger.info(" - inputpath: " + inputPath);
    sLogger.info(" - outputpath: " + outputPath);

    final JobConf jobConf = new JobConf(GraphToFasta.class);
    AvroJob.setInputSchema(jobConf, GraphNodeData.SCHEMA$);
    initializeJobConfiguration(jobConf);

    final Path outputDir = new Path(outputPath);
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobConf, outputDir);
    jobConf.setInputFormat(new AvroInputFormat<GraphNodeData>().getClass());
    jobConf.setOutputFormat(TextOutputFormat.class);

    // Mapper-only job: zero reducers.
    jobConf.setMapperClass(GraphToFastqMapper.class);
    jobConf.setNumReduceTasks(0);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);

    if (stage_options.containsKey("writeconfig")) {
        writeJobConfig(jobConf);
        return null;
    }

    // Delete the output directory if it exists already.
    // TODO(jlewi): We should only delete an existing directory
    // if explicitly told to do so.
    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(outputDir)) {
        sLogger.info("Deleting output path: " + outputDir.toString() + " " + "because it already exists.");
        fs.delete(outputDir, true);
    }

    final long startMillis = System.currentTimeMillis();
    final RunningJob result = JobClient.runJob(jobConf);
    final long endMillis = System.currentTimeMillis();

    final float elapsedSeconds = (float) ((endMillis - startMillis) / 1000.0);
    System.out.println("Runtime: " + elapsedSeconds + " s");
    return result;
}