List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
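setMapOutputValueClass declares the class of the values emitted by the mapper. The map output value class defaults to the job's final output value class (set with setOutputValueClass), so the call is only needed when the mapper emits a different value type than the job's final output. A minimal driver sketch is shown below; MyMapper, MyReducer, and the input/output paths are hypothetical placeholders, not classes from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetMapOutputValueClassDemo {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetMapOutputValueClassDemo.class);
        job.setJobName("map-output-value-class-demo");

        // The mapper emits (Text, IntWritable) pairs while the reducer emits
        // (Text, Text), so the map output classes must be declared explicitly;
        // otherwise the framework assumes the final output classes below.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        // MyMapper and MyReducer are hypothetical application classes that
        // must emit/accept the types declared above.
        // job.setMapperClass(MyMapper.class);
        // job.setReducerClass(MyReducer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}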
From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java
License:Apache License
public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    inputMap.clear();
    outputMap.clear();
    Thread.sleep(15000);

    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;
    for (int i = 0; i < expectedSize; i++) {
        count.set(i);
        put.put(count, contentW);
    }
    put.close();

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();

    JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
    configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
    configuration.setMapOutputKeyClass(IntWritable.class);
    configuration.setMapOutputValueClass(Text.class);
    configuration.setOutputKeyClass(IntWritable.class);
    configuration.setOutputValueClass(Text.class);
    configuration.setInputFormat(NamedMapInputFormatMapred.class);
    configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
    NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
    NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);

    assertEquals(inputMap.size(), outputMap.size() + expectedSize); // should be 0 + expected
    HServerJobClient.runJob(configuration, false, grid);
    assertEquals(inputMap.size(), outputMap.size());

    inputMap.clear();
    outputMap.clear();
    grid.unload();
    return 1;
}
From source file:com.spotify.hdfs2cass.BulkLoader.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }
    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }
    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }
    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());
    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);
    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    // map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(CassandraPartitioner.class);
    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}
From source file:com.vsii.ttxvn.crawling.DeleteFailedDataJob.java
License:Apache License
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: DeleteFailedDataJob <crawldb>");
        return 1;
    }

    String crawldb = args[0];

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeleteFailedDataJob: starting at " + sdf.format(start));

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/dedup-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());
    job.setJobName("DeleteFailedData on " + crawldb);

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    try {
        RunningJob rj = JobClient.runJob(job);
        Group g = rj.getCounters().getGroup("DeleteFailedDataJobStatus");
        if (g != null) {
            long dups = g.getCounter("Documents marked as duplicate");
            LOG.info("DeleteFailedData: " + (int) dups + " documents marked as duplicates");
        }
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataJob: " + StringUtils.stringifyException(e));
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("DeleteFailedData: Updating status of duplicate urls into crawl db.");
    }
    Path dbPath = new Path(crawldb);
    JobConf mergeJob = CrawlDb.createJob(getConf(), dbPath);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);

    try {
        JobClient.runJob(mergeJob);
    } catch (final Exception e) {
        LOG.error("DeleteFailedDataMergeJob: " + StringUtils.stringifyException(e));
        return -1;
    }
    CrawlDb.install(mergeJob, dbPath);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("DeleteFailedData finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from Wikipedia article to (srcID, (targetID, anchor)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfStrings.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Random r = new Random();
    //String tmpOutput = "tmp-" + this.getClass().getCanonicalName() + "-" + r.nextInt(10000);
    //LOG.info("intermediate folder for merge " + tmpOutput);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    //FileOutputFormat.setOutputPath(conf, new Path(tmpOutput));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);

    /*
    // merge
    String finalO = outputPath + "/part-00000/data";
    FileSystem.get(conf).mkdirs(new Path(outputPath + "part-00000"));
    getMergeInHdfs(tmpOutput, finalO, conf);
    FileSystem.get(conf).delete(new Path(tmpOutput), true);
    */
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    String location = "map.dat";
    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}
From source file:com.zjy.mongo.util.MongoTool.java
License:Apache License
private int runMapredJob(final Configuration conf) {
    final JobConf job = new JobConf(conf, getClass());
    /**
     * Any arguments specified with -D <property>=<value> on the CLI will be
     * picked up and set here; they override any XML-level values. Note that
     * the space after -D is important - without it the property is consumed
     * by Java itself rather than passed through to the job.
     */

    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MapredMongoConfigUtil.getReducer(conf));

    job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf));
    job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf));

    Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely, e.g. print debug output;
     * only works with foreground jobs.
     */
    final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground (wait for completion) or background?
     */
    final boolean background = MapredMongoConfigUtil.isJobBackground(conf);

    try {
        RunningJob runningJob = JobClient.runJob(job);
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            runningJob.waitForCompletion();
            return 0;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:contrail.stages.GraphToFasta.java
License:Open Source License
@Override
public RunningJob runJob() throws Exception {
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");

    sLogger.info(" - inputpath: " + inputPath);
    sLogger.info(" - outputpath: " + outputPath);

    JobConf conf = new JobConf(GraphToFasta.class);
    AvroJob.setInputSchema(conf, GraphNodeData.SCHEMA$);
    initializeJobConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    AvroInputFormat<GraphNodeData> input_format = new AvroInputFormat<GraphNodeData>();
    conf.setInputFormat(input_format.getClass());
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Make it mapper only.
    conf.setNumReduceTasks(0);
    conf.setMapperClass(GraphToFastqMapper.class);

    if (stage_options.containsKey("writeconfig")) {
        writeJobConfig(conf);
    } else {
        // Delete the output directory if it exists already.
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            // TODO(jlewi): We should only delete an existing directory
            // if explicitly told to do so.
            sLogger.info("Deleting output path: " + out_path.toString() + " "
                    + "because it already exists.");
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) ((endtime - starttime) / 1000.0);
        System.out.println("Runtime: " + diff + " s");
        return result;
    }
    return null;
}