List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
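setMapOutputKeyClass declares the class of the intermediate keys emitted by the mappers; if it is never called, the map output key class defaults to the job's final output key class. Below is a minimal, self-contained sketch (not taken from any of the projects listed below) of an old-API (org.apache.hadoop.mapred) job where the map output types (Text/IntWritable) differ from the final output types (Text/LongWritable), so both must be declared explicitly. The TokenMapper and SumReducer classes and the job name are illustrative placeholders.
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class MapOutputKeyClassSketch {

    // emits <Text, IntWritable>, which differs from the job's final <Text, LongWritable> output
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> out, Reporter reporter)
                throws IOException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                out.collect(word, ONE);
            }
        }
    }

    // sums the per-token counts and emits <Text, LongWritable>
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, LongWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, LongWritable> out,
                Reporter reporter) throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            out.collect(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(MapOutputKeyClassSketch.class);
        job.setJobName("map-output-key-class-sketch");

        job.setMapperClass(TokenMapper.class);
        // intermediate (map output) types must be declared because they
        // differ from the final output types set below
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(SumReducer.class);
        // final (reduce output) types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}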
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {
    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed in on the command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009,
    // so we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ARCInputFormat. This input source uses the multi-threaded jets3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to a
    // number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ARCInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:" + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopV2JobSelfTest.java
License:Open Source License
/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws GridException If fails.
 */
public void testCustomSerializationApplying() throws GridException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();
    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));
    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();
    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());
    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " "
            + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (reduceCnt == -1) {
        reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        reduceCnt = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);

    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = JobClient.runJob(job);
    return r;
}
From source file:org.hxx.hadoop.GeneratorRedHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }

    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);

    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}
From source file:org.locationtech.geomesa.jobs.interop.mapred.FeatureWriterJob.java
License:Open Source License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(FeatureCountJob.class);
    conf.setJobName("simple feature writing");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(ScalaSimpleFeature.class);
    conf.setMapperClass(MyMapper.class);
    conf.setNumReduceTasks(0);

    conf.setInputFormat(GeoMesaInputFormat.class);
    conf.setOutputFormat(GeoMesaOutputFormat.class);

    Map<String, String> params = new HashMap<String, String>();
    params.put("instanceId", "myinstance");
    params.put("zookeepers", "zoo1,zoo2,zoo3");
    params.put("user", "myuser");
    params.put("password", "mypassword");
    params.put("tableName", "mycatalog");

    Query query = new Query("myfeature", ECQL.toFilter("BBOX(geom, -165,5,-50,75)"));

    GeoMesaInputFormat.configure(conf, params, query);

    Map<String, String> outParams = new HashMap<String, String>();
    outParams.put("instanceId", "myinstance");
    outParams.put("zookeepers", "zoo1,zoo2,zoo3");
    outParams.put("user", "myuser");
    outParams.put("password", "mypassword");
    outParams.put("tableName", "mycatalog_2");

    GeoMesaOutputFormat.configureDataStore(conf, outParams);

    JobClient.runJob(conf);
}
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
public int initJob(JobConf jobConf, String input, String output) throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);

    /**
     * Need to get all of the sample names/labels
     */
    JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);
    cacheConf.setJobName("CacheNorm2MapReduce");
    cacheConf.setNumReduceTasks(1); // Want ONE part file

    // Set up IdentityMapper
    SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input));
    cacheConf.setInputFormat(SequenceFileInputFormat.class);
    cacheConf.setMapperClass(Norm2Mapper.class);
    cacheConf.setOutputKeyClass(StringDoublePairWritable.class);
    cacheConf.setOutputValueClass(SparseVectorWritable.class);

    // Set up IdentityReducer
    cacheConf.setReducerClass(IdentityReducer.class);
    cacheConf.setOutputFormat(SequenceFileOutputFormat.class);
    cacheConf.setNumReduceTasks(1);

    Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent());
    LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString()));
    SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath);
    JobClient.runJob(cacheConf);

    Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000");

    // need to know the size (the reducer might be able to send this back via the Reporter,
    // but how do we grab that info?)
    StringDoublePairWritable key = new StringDoublePairWritable();
    int size = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf);
    boolean hasNext = reader.next(key);
    while (hasNext) {
        size += 1;
        hasNext = reader.next(key);
    }
    try {
        reader.close();
    } catch (IOException ioe) {
        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
        LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
    }

    //LOG.info(String.format("Caching model file %s", qInputPath.toString()));
    URI listURI = new URI(fs.makeQualified(cachePath).toString());
    DistributedCache.addCacheFile(listURI, conf);
    LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(),
            cachePath.getName(), size));
    conf.set(CACHE_PATH, cachePath.getName());
    conf.setInt(DISTANCE_MATRIX_SIZE, size);

    /**
     * Main MapReduce Task of generating dot products
     */
    LOG.info("Generating distances");
    JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class);
    distanceConf.setJobName("DistanceMapReduce");

    // Set up distance mapper
    SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input));
    distanceConf.setInputFormat(SequenceFileInputFormat.class);
    distanceConf.setMapperClass(DistanceMap.class);
    distanceConf.setMapOutputKeyClass(Text.class);
    distanceConf.setMapOutputValueClass(SparseVectorWritable.class);

    // Set up reducer to merge lower-triangle results into a single dense distance vector
    distanceConf.setReducerClass(DistanceReducer.class);
    distanceConf.setOutputKeyClass(Text.class);
    distanceConf.setOutputValueClass(DenseVectorWritable.class);
    distanceConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output));

    JobClient.runJob(distanceConf);

    return 0;
}
From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java
License:Open Source License
@Test
public void testCombinerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./test-res/wordcount-mapper.ktr", "./test-res/wordcount-reducer.ktr",
            "./test-res/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransCombiner combiner = new GenericTransCombiner();
    combiner.configure(jobConf);

    // the combiner's output classes should follow the map output classes
    assertEquals(jobConf.getMapOutputKeyClass(), combiner.getOutClassK());
    assertEquals(jobConf.getMapOutputValueClass(), combiner.getOutClassV());
}
From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java
License:Open Source License
@Test
public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./test-res/wordcount-mapper.ktr", "./test-res/wordcount-reducer.ktr",
            "./test-res/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);

    // the reducer's output classes should follow the job's final output classes
    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
}