List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
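JobConf extends Configuration, so a value stored with setInt can be read back with getInt, whose second argument is the default returned when the property is unset. Below is a minimal, self-contained sketch of that round trip; the property name "example.max.retries" is purely illustrative and not a real Hadoop key.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Store an int-valued property on the job configuration.
        job.setInt("example.max.retries", 4);

        // Read it back; the second argument is the default used when
        // the property has not been set.
        int maxRetries = job.getInt("example.max.retries", 1);

        System.out.println("example.max.retries = " + maxRetries); // prints 4
    }
}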
. From source file:org.commoncrawl.hadoop.io.deprecated.JetS3tARCSource.java
License:Apache License
/**
 * Sets the maximum number of times to try reading a file.
 *
 * <p>
 * Default is 4.
 *
 * @param job
 *          the job to set the maximum number of retries for
 * @param maxTries
 *          the maximum number of attempts per file
 */
public static final void setMaxRetries(JobConf job, int maxtries) {
    job.setInt(P_MAX_TRIES, maxtries);
}
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded jets3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);

    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ...)
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);

        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSaveTest.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.sampleArtistsDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.sampleArtistsJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
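The sleep.job.* values stored above are intended to be read back inside the tasks, where JobConf.getInt is the counterpart to setInt and its second argument serves as the default. The following is a hypothetical mapper sketch illustrating that round trip; the class name SleepLikeMapper and its key/value types are illustrative and not the actual ConfTester implementation.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SleepLikeMapper extends MapReduceBase
        implements Mapper<IntWritable, IntWritable, IntWritable, NullWritable> {

    private long mapSleepTime = 100;
    private int mapSleepCount = 1;

    @Override
    public void configure(JobConf job) {
        // Read back the values stored with setLong/setInt; the second
        // argument is the default used when the property is absent.
        mapSleepTime = job.getLong("sleep.job.map.sleep.time", mapSleepTime);
        mapSleepCount = job.getInt("sleep.job.map.sleep.count", mapSleepCount);
    }

    @Override
    public void map(IntWritable key, IntWritable value,
            OutputCollector<IntWritable, NullWritable> output, Reporter reporter) throws IOException {
        try {
            // Sleep mapSleepCount times, spreading mapSleepTime across the iterations.
            for (int i = 0; i < mapSleepCount; i++) {
                Thread.sleep(mapSleepTime / Math.max(1, mapSleepCount));
            }
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}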
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Open Source License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);
        job.setJarByClass(GridHadoopWordCount2.class);

        GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceTest.java
License:Open Source License
/**
 * Tests whole job execution with all phases in all combinations of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);

    for (int i = 0; i < 8; i++) {
        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;

        JobConf jobConf = new JobConf();

        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, GridHadoopFSCounterWriter.class.getName());

        // To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setJarByClass(GridHadoopWordCount2.class);

        GridHadoopJobId jobId = new GridHadoopJobId(UUID.randomUUID(), 1);

        GridFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

        fut.get();

        checkJobStatistics(jobId);

        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: from table=" + table + " segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(TableTopInputFormat.class);
    job.setMapperClass(GenerateMark.class);
    job.setPartitionerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}