Example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) 

Document

Set the value class for the map output data.
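If this method is never called, the map output value class defaults to the job's (final) output value class, so it only needs to be set when the two differ. Before the real-world excerpts below, here is a minimal, self-contained sketch of where the call fits into an old-API (org.apache.hadoop.mapred) job; the class names and the /tmp input and output paths are placeholders for illustration only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.LongSumReducer;

public class SetMapOutputValueClassExample {

    /** Emits each input line as the key and a count of 1 as the value. */
    public static class LineCountMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, LongWritable> {
        private static final LongWritable ONE = new LongWritable(1);

        public void map(LongWritable offset, Text line, OutputCollector<Text, LongWritable> output,
                Reporter reporter) throws IOException {
            output.collect(line, ONE);
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(SetMapOutputValueClassExample.class);
        job.setJobName("setMapOutputValueClass example");

        // input/output formats and placeholder paths
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("/tmp/example-input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example-output"));

        // mapper and reducer
        job.setMapperClass(LineCountMapper.class);
        job.setReducerClass(LongSumReducer.class);

        // declare the intermediate (map output) key/value classes;
        // the mapper above emits Text keys and LongWritable values
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // final (reduce) output key/value classes
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        JobClient.runJob(job);
    }
}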

Usage

From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java

License:Open Source License

/**
 * main routine
 * 
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed in command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** ARC file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".

    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());

    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ARCInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARCFile, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ... 
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java

License:Apache License

@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}

From source file:org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSaveTest.java

License:Apache License

@Parameters
public static Collection<Object[]> configs() {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.sampleArtistsDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.sampleArtistsJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}

From source file:org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java

License:Apache License

@Test
public void testBasicSave() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set("es.resource", "mroldapi/save");

    JobClient.runJob(conf);
}

From source file:org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java

License:Apache License

@Test(expected = IllegalArgumentException.class)
public void testIndexAutoCreateDisabled() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set(ConfigurationOptions.ES_RESOURCE, "mroldapi/non-existing");
    conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, "no");

    JobClient.runJob(conf);
}

From source file:org.gbif.ocurrence.index.solr.ConfTester.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopV2JobSelfTest.java

License:Open Source License

/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws GridException If it fails.
 */
public void testCustomSerializationApplying() throws GridException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}

From source file:org.hxx.hadoop.GeneratorHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}

From source file:org.hxx.hadoop.GeneratorRedHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}