List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
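JobConf extends Configuration, so a value stored with setInt can be read back with getInt, whose second argument is the default returned when the property is unset. Below is a minimal, self-contained sketch of that round trip; the property name "example.max.retries" is purely illustrative and not a real Hadoop key.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Store an int-valued property on the job configuration.
        job.setInt("example.max.retries", 4);

        // Read it back; the second argument is the default used when
        // the property has not been set.
        int maxRetries = job.getInt("example.max.retries", 1);

        System.out.println("example.max.retries = " + maxRetries); // prints 4
    }
}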
. From source file:org.commoncrawl.hadoop.io.deprecated.JetS3tARCSource.java
License:Apache License
/**
 * Sets the maximum number of times to try reading a file.
 *
 * <p>
 * Default is 4.
 *
 * @param job
 *          the job to set the maximum number of retries for
 * @param maxTries
 *          the maximum number of attempts per file
 */
public static final void setMaxRetries(JobConf job, int maxtries) {
    job.setInt(P_MAX_TRIES, maxtries);
}
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded jets3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);

    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ...)
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);

        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSaveTest.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.sampleArtistsDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.sampleArtistsJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
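The sleep.job.* values stored above are intended to be read back inside the tasks, where JobConf.getInt is the counterpart to setInt and its second argument serves as the default. The following is a hypothetical mapper sketch illustrating that round trip; the class name SleepLikeMapper and its key/value types are illustrative and not the actual ConfTester implementation.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SleepLikeMapper extends MapReduceBase
        implements Mapper<IntWritable, IntWritable, IntWritable, NullWritable> {

    private long mapSleepTime = 100;
    private int mapSleepCount = 1;

    @Override
    public void configure(JobConf job) {
        // Read back the values stored with setLong/setInt; the second
        // argument is the default used when the property is absent.
        mapSleepTime = job.getLong("sleep.job.map.sleep.time", mapSleepTime);
        mapSleepCount = job.getInt("sleep.job.map.sleep.count", mapSleepCount);
    }

    @Override
    public void map(IntWritable key, IntWritable value,
            OutputCollector<IntWritable, NullWritable> output, Reporter reporter) throws IOException {
        try {
            // Sleep mapSleepCount times, spreading mapSleepTime across the iterations.
            for (int i = 0; i < mapSleepCount; i++) {
                Thread.sleep(mapSleepTime / Math.max(1, mapSleepCount));
            }
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}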
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Open Source License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);
        job.setJarByClass(GridHadoopWordCount2.class);

        GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceTest.java
License:Open Source License
/**
 * Tests whole job execution with all phases in all combinations of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);

    for (int i = 0; i < 8; i++) {
        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;

        JobConf jobConf = new JobConf();

        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, GridHadoopFSCounterWriter.class.getName());

        // To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setJarByClass(GridHadoopWordCount2.class);

        GridHadoopJobId jobId = new GridHadoopJobId(UUID.randomUUID(), 1);

        GridFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

        fut.get();

        checkJobStatistics(jobId);

        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: from table=" + table + " segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(TableTopInputFormat.class);
    job.setMapperClass(GenerateMark.class);
    job.setPartitionerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}