List of usage examples for org.apache.hadoop.mapred.JobConf.setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
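setMapOutputValueClass declares the class of the values emitted by the mapper; it only needs to be called when the intermediate (map output) value type differs from the job's final output value class set via setOutputValueClass. The following is a minimal sketch of where the call fits in an old-API (org.apache.hadoop.mapred) job setup; MyJob, MyMapper, MyReducer and the "in"/"out" paths are hypothetical placeholders, not taken from the examples below.

// Minimal usage sketch (hypothetical MyJob/MyMapper/MyReducer and paths).
JobConf job = new JobConf(MyJob.class);
job.setJobName("example");

job.setMapperClass(MyMapper.class);   // assumed to emit <Text, IntWritable> pairs
job.setReducerClass(MyReducer.class); // assumed to emit <Text, LongWritable> pairs

// intermediate (map output) types -- required here because they differ
// from the final output types declared below
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// final (reduce output) types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);

FileInputFormat.setInputPaths(job, new Path("in"));
FileOutputFormat.setOutputPath(job, new Path("out"));
JobClient.runJob(job);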
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {
    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ARCInputFormat. This input source uses the multi-threaded jets3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ARCInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ...)
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();
        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:" + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSaveTest.java
License:Apache License
@Parameters
public static Collection<Object[]> configs() {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.sampleArtistsDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.sampleArtistsJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file:org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java
License:Apache License
@Test
public void testBasicSave() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set("es.resource", "mroldapi/save");

    JobClient.runJob(conf);
}
From source file:org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java
License:Apache License
@Test(expected = IllegalArgumentException.class)
public void testIndexAutoCreateDisabled() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set(ConfigurationOptions.ES_RESOURCE, "mroldapi/non-existing");
    conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, "no");

    JobClient.runJob(conf);
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopV2JobSelfTest.java
License:Open Source License
/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws GridException If fails.
 */
public void testCustomSerializationApplying() throws GridException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}
From source file:org.hxx.hadoop.GeneratorRedHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}