Example usage for org.apache.hadoop.mapred JobConf setOutputKeyClass

List of usage examples for org.apache.hadoop.mapred JobConf setOutputKeyClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setOutputKeyClass.

Prototype

public void setOutputKeyClass(Class<?> theClass) 

Source Link

Document

Set the key class for the job output data.

Usage

From source file:com.pinterest.hdfsbackup.distcp.DistCp.java

License:Apache License

private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, DistCp.class);
    jobconf.setJobName(NAME);//from w  w w. j a va2 s. c  o m

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobconf.setMapSpeculativeExecution(false);

    jobconf.setInputFormat(CopyInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);

    jobconf.setMapperClass(CopyFilesMapper.class);
    jobconf.setNumReduceTasks(0);
    return jobconf;
}

From source file:com.qfa.WordCount.java

License:Apache License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker./*from w w  w . j a  v  a 2 s  . c o m*/
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:com.rapleaf.hank.hadoop.HadoopDomainBuilder.java

License:Apache License

public static final JobConf createJobConfiguration(String inputPath,
        Class<? extends InputFormat> inputFormatClass, Class<? extends Mapper> mapperClass, int versionNumber,
        DomainBuilderProperties properties) {
    JobConf conf = new JobConf();
    // Input specification
    conf.setInputFormat(inputFormatClass);
    FileInputFormat.setInputPaths(conf, inputPath);
    // Mapper class and key/value classes
    conf.setMapperClass(mapperClass);//from ww w.  j a  v a2  s .  c  o  m
    conf.setMapOutputKeyClass(KeyAndPartitionWritableComparable.class);
    conf.setMapOutputValueClass(ValueWritable.class);
    // Reducer class and key/value classes
    conf.setReducerClass(DomainBuilderReducer.class);
    conf.setOutputKeyClass(KeyAndPartitionWritable.class);
    conf.setOutputValueClass(ValueWritable.class);
    // Output format
    conf.setOutputFormat(properties.getOutputFormatClass());
    // Output path (set to tmp output path)
    FileOutputFormat.setOutputPath(conf, new Path(properties.getTmpOutputPath(versionNumber)));
    // Partitioner
    conf.setPartitionerClass(DomainBuilderPartitioner.class);
    // Output Committer
    conf.setOutputCommitter(DomainBuilderOutputCommitter.class);
    // Hank specific configuration
    properties.setJobConfProperties(conf, versionNumber);
    return conf;
}

From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java

License:Apache License

public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    inputMap.clear();//from w w  w  .  jav  a  2  s. c  om
    outputMap.clear();
    Thread.sleep(15000);
    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;

    for (int i = 0; i < expectedSize; i++) {
        count.set(i);
        put.put(count, contentW);
    }
    put.close();
    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();

    JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
    configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
    configuration.setMapOutputKeyClass(IntWritable.class);
    configuration.setMapOutputValueClass(Text.class);
    configuration.setOutputKeyClass(IntWritable.class);
    configuration.setOutputValueClass(Text.class);
    configuration.setInputFormat(NamedMapInputFormatMapred.class);
    configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
    NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
    NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);
    assertEquals(inputMap.size(), outputMap.size() + expectedSize); // should be 0 + expected
    HServerJobClient.runJob(configuration, false, grid);
    assertEquals(inputMap.size(), outputMap.size());
    inputMap.clear();
    outputMap.clear();
    grid.unload();
    return 1;
}

From source file:com.scaleoutsoftware.soss.hserver.Test_WordCountMapred.java

License:Apache License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker./*from  w w w.  j  a v  a2 s .  c om*/
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), Test_WordCountMapred.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setNumReduceTasks(0);

    String in = args.length == 2 ? args[0] : "random.txt";
    String out = args.length == 2 ? args[1]
            : "c:\\development\\mapred_output\\dir" + System.currentTimeMillis();

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addJar("/path/to/your/jar").load();

    // HERE IS STANDARD HADOOP INVOCATION
    //JobClient.runJob(conf);

    // HSERVER INVOCATION
    HServerJobClient.runJob(conf, false, grid);
    return 0;
}

From source file:com.spotify.hdfs2cass.BulkLoader.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);//  ww w .  j  ava  2 s. c o  m

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }

    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }

    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }

    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());

    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);

    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    //map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(CassandraPartitioner.class);

    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}

From source file:com.talis.mapreduce.wordcount.oldapi.WordCount.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }/*  w  w  w.j  a v a 2s.c  o  m*/

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Word Count");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(WordCountMapper.class);
    conf.setCombinerClass(WordCountReducer.class);
    conf.setReducerClass(WordCountReducer.class);

    //      conf.setPartitionerClass(HashPartitioner.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:com.TCG.Nutch_DNS.Generator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation./*  w w w .  j  av a2 s. c om*/
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.TCG.Nutch_DNS.HostDb.java

License:Apache License

public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }//w  w w . j  av a2  s .c  om
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(HostDbFilter.class);
    job.setReducerClass(HostDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}

From source file:com.TCG.Nutch_DNS.Injector.java

License:Apache License

public void inject(Path hostDb, Path crawlDb) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: hostDb: " + hostDb);
        LOG.info("Injector: carwlDb: " + crawlDb);
    }/*from w  ww . j av a  2s  .  com*/

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected host to host db entries.");
    }

    FileSystem fs = FileSystem.get(getConf());
    // determine if the crawldb already exists
    boolean dbExists = fs.exists(hostDb);

    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + hostDb);
    FileInputFormat.addInputPath(sortJob, crawlDb);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    if (dbExists) {
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        //HostReducer,host
        sortJob.setReducerClass(ExitHostReducer.class);
    } else {
        sortJob.setOutputFormat(MapFileOutputFormat.class);

        //HostReducer,host
        sortJob.setReducerClass(NotExitHostReducer.class);

        sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    }
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());

    RunningJob mapJob = null;
    try {
        mapJob = JobClient.runJob(sortJob);
    } catch (IOException e) {
        fs.delete(tempDir, true);
        throw e;
    }

    if (dbExists) {

        // merge with existing host db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected hostDb into old hostDb.");
        }
        JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        //HostDb.createJobReducer:HostDbReducer
        mergeJob.setReducerClass(InjectReducer.class);
        try {
            RunningJob merge = JobClient.runJob(mergeJob);
        } catch (IOException e) {
            fs.delete(tempDir, true);
            throw e;
        }
        HostDb.install(mergeJob, hostDb);
    } else {
        HostDb.install(sortJob, hostDb);
    }

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}