Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass().

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the map output data.
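
If the map output key/value classes are not set explicitly, the final output classes configured via setOutputKeyClass/setOutputValueClass are used, so this call matters whenever the intermediate types differ from the final ones, or when you simply want to declare them explicitly. Below is a minimal, self-contained sketch of the typical pattern, using the stock TokenCountMapper and LongSumReducer from org.apache.hadoop.mapred.lib; the job name and the command-line input/output paths are illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LongSumReducer;
import org.apache.hadoop.mapred.lib.TokenCountMapper;

public class TokenCountJob {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(TokenCountJob.class);
        job.setJobName("token count");

        // TokenCountMapper emits (Text, LongWritable) pairs, so declare the
        // intermediate key/value classes explicitly
        job.setMapperClass(TokenCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // LongSumReducer sums the per-token counts; its output types happen
        // to match the map output types here, but they need not in general
        job.setReducerClass(LongSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}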

Usage

From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java

License:Open Source License

/**
 * main routine
 * 
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed in command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** ARC file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".

    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());

    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ... 
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.gbif.ocurrence.index.solr.ConfTester.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopV2JobSelfTest.java

License:Open Source License

/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws GridException If fails.
 */
public void testCustomSerializationApplying() throws GridException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}

From source file:org.hxx.hadoop.GeneratorHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}

From source file:org.hxx.hadoop.GeneratorRedHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    return JobClient.runJob(job);
}

From source file:org.locationtech.geomesa.jobs.interop.mapred.FeatureWriterJob.java

License:Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(FeatureCountJob.class);
    conf.setJobName("simple feature writing");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(ScalaSimpleFeature.class);

    conf.setMapperClass(MyMapper.class);
    conf.setNumReduceTasks(0);

    conf.setInputFormat(GeoMesaInputFormat.class);
    conf.setOutputFormat(GeoMesaOutputFormat.class);

    Map<String, String> params = new HashMap<String, String>();
    params.put("instanceId", "myinstance");
    params.put("zookeepers", "zoo1,zoo2,zoo3");
    params.put("user", "myuser");
    params.put("password", "mypassword");
    params.put("tableName", "mycatalog");

    Query query = new Query("myfeature", ECQL.toFilter("BBOX(geom, -165,5,-50,75)"));

    GeoMesaInputFormat.configure(conf, params, query);

    Map<String, String> outParams = new HashMap<String, String>();
    outParams.put("instanceId", "myinstance");
    outParams.put("zookeepers", "zoo1,zoo2,zoo3");
    outParams.put("user", "myuser");
    outParams.put("password", "mypassword");
    outParams.put("tableName", "mycatalog_2");

    GeoMesaOutputFormat.configureDataStore(conf, outParams);

    JobClient.runJob(conf);
}

From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java

License:Open Source License

public int initJob(JobConf jobConf, String input, String output) throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);

    /**
     * Need to get all of the sample names/labels
     */
    JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);
    cacheConf.setJobName("CacheNorm2MapReduce");
    cacheConf.setNumReduceTasks(1); // Want ONE part file

    // Set up IdentityMapper
    SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input));
    cacheConf.setInputFormat(SequenceFileInputFormat.class);
    cacheConf.setMapperClass(Norm2Mapper.class);
    cacheConf.setOutputKeyClass(StringDoublePairWritable.class);
    cacheConf.setOutputValueClass(SparseVectorWritable.class);

    // Set up IdentityReducer
    cacheConf.setReducerClass(IdentityReducer.class);
    cacheConf.setOutputFormat(SequenceFileOutputFormat.class);
    cacheConf.setNumReduceTasks(1);
    Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent());
    LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString()));
    SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath);
    JobClient.runJob(cacheConf);

    Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000");

    // need to know the size (the reducer might be able to send this back via the Reporter, but how do we grab that info?)
    StringDoublePairWritable key = new StringDoublePairWritable();
    int size = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf);
    boolean hasNext = reader.next(key);
    while (hasNext) {
        size += 1;
        hasNext = reader.next(key);
    }
    try {
        reader.close();
    } catch (IOException ioe) {
        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
        LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
    }

    //LOG.info(String.format("Caching model file %s", qInputPath.toString()));
    URI listURI = new URI(fs.makeQualified(cachePath).toString());
    DistributedCache.addCacheFile(listURI, conf);
    LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(),
            cachePath.getName(), size));
    conf.set(CACHE_PATH, cachePath.getName());
    conf.setInt(DISTANCE_MATRIX_SIZE, size);

    /**
     * Main MapReduce Task of generating dot products
     */
    LOG.info("Generating distances");
    JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class);
    distanceConf.setJobName("DistanceMapReduce");
    // Set up distance mapper
    SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input));
    distanceConf.setInputFormat(SequenceFileInputFormat.class);
    distanceConf.setMapperClass(DistanceMap.class);
    distanceConf.setMapOutputKeyClass(Text.class);
    distanceConf.setMapOutputValueClass(SparseVectorWritable.class);

    // Set up reducer to merge lower-triangle results into a single dense distance vector
    distanceConf.setReducerClass(DistanceReducer.class);
    distanceConf.setOutputKeyClass(Text.class);
    distanceConf.setOutputValueClass(DenseVectorWritable.class);
    distanceConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output));
    JobClient.runJob(distanceConf);

    return 0;
}

From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java

License:Open Source License

@Test
public void testCombinerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./test-res/wordcount-mapper.ktr", "./test-res/wordcount-reducer.ktr",
            "./test-res/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransCombiner combiner = new GenericTransCombiner();

    combiner.configure(jobConf);

    assertEquals(jobConf.getMapOutputKeyClass(), combiner.getOutClassK());
    assertEquals(jobConf.getMapOutputValueClass(), combiner.getOutClassV());
}

From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java

License:Open Source License

@Test
public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./test-res/wordcount-mapper.ktr", "./test-res/wordcount-reducer.ktr",
            "./test-res/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();

    reducer.configure(jobConf);

    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
}