List of usage examples for org.apache.hadoop.mapred JobConf setJobName
public void setJobName(String name)
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }//from www.j a v a 2 s .co m String input = args[0]; String output = args[1]; String schemaFile = args[2]; JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Avro projection"); FileInputFormat.addInputPath(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); Schema schema = new Schema.Parser().parse(new File(schemaFile)); AvroJob.setInputSchema(conf, schema); AvroJob.setMapOutputSchema(conf, schema); AvroJob.setOutputSchema(conf, schema); conf.setNumReduceTasks(0); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }/*from w w w. ja va 2 s . c o m*/ String input = args[0]; String output = args[1]; String schemaFile = args[2]; JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Avro sort"); FileInputFormat.addInputPath(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); Schema schema = new Schema.Parser().parse(new File(schemaFile)); AvroJob.setInputSchema(conf, schema); Schema intermediateSchema = Pair.getPairSchema(schema, schema); AvroJob.setMapOutputSchema(conf, intermediateSchema); AvroJob.setOutputSchema(conf, schema); AvroJob.setMapperClass(conf, SortMapper.class); AvroJob.setReducerClass(conf, SortReducer.class); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }/*from w w w .java2 s .c o m*/ JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature"); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING)); AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.INT), WeatherRecord.SCHEMA$)); AvroJob.setOutputSchema(conf, WeatherRecord.SCHEMA$); conf.setInputFormat(AvroUtf8InputFormat.class); AvroJob.setMapperClass(conf, MaxTemperatureMapper.class); AvroJob.setCombinerClass(conf, MaxTemperatureCombiner.class); AvroJob.setReducerClass(conf, MaxTemperatureReducer.class); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, "<ncdc input> <metoffice input> <output>"); return -1; }/*from w w w . j a v a 2 s. c o m*/ JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature with multiple input formats"); Path ncdcInputPath = new Path(args[0]); Path metOfficeInputPath = new Path(args[1]); Path outputPath = new Path(args[2]); MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, MaxTemperatureMapper.class); MultipleInputs.addInputPath(conf, metOfficeInputPath, TextInputFormat.class, MetOfficeMaxTemperatureMapper.class); FileOutputFormat.setOutputPath(conf, outputPath); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setCombinerClass(MaxTemperatureReducer.class); conf.setReducerClass(MaxTemperatureReducer.class); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, "<ncdc input> <station input> <output>"); return -1; }//from w w w .ja va 2 s . co m JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Join record with station name"); Path ncdcInputPath = new Path(args[0]); Path stationInputPath = new Path(args[1]); Path outputPath = new Path(args[2]); MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class); MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class); FileOutputFormat.setOutputPath(conf, outputPath); /*[*/conf.setPartitionerClass(KeyPartitioner.class); conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);/*]*/ conf.setMapOutputKeyClass(TextPair.class); conf.setReducerClass(JoinReducer.class); conf.setOutputKeyClass(Text.class); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature"); Path inputDir = new Path("widgets"); FileInputFormat.addInputPath(conf, inputDir); FileOutputFormat.setOutputPath(conf, new Path("maxwidget")); Schema schema = readSchema(inputDir, conf); conf.setInputFormat(AvroInputFormat.class); conf.setOutputFormat(AvroOutputFormat.class); AvroJob.setInputSchema(conf, schema); AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.LONG), schema)); AvroJob.setOutputSchema(conf, schema); AvroJob.setMapperClass(conf, MaxWidgetMapper.class); AvroJob.setReducerClass(conf, MaxWidgetReducer.class); conf.setNumReduceTasks(1);/*from w w w . jav a 2 s . com*/ JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
/** * Create a JobConf for a Job that will calculate the number of unique listeners per track. * //from ww w . ja v a 2 s .c om * @param inputDir The path to the folder containing the raw listening data files. * @return The unique listeners JobConf. */ private JobConf getUniqueListenersJobConf(Path inputDir) { log.info("Creating configuration for unique listeners Job"); // output results to a temporary intermediate folder, this will get deleted by start() method Path uniqueListenersOutput = new Path("uniqueListeners"); JobConf conf = new JobConf(TrackStatisticsProgram.class); conf.setOutputKeyClass(IntWritable.class); // track id conf.setOutputValueClass(IntWritable.class); // number of unique listeners conf.setInputFormat(TextInputFormat.class); // raw listening data conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(UniqueListenersMapper.class); conf.setCombinerClass(UniqueListenersCombiner.class); conf.setReducerClass(UniqueListenersReducer.class); FileInputFormat.addInputPath(conf, inputDir); FileOutputFormat.setOutputPath(conf, uniqueListenersOutput); conf.setJobName("uniqueListeners"); return conf; }
From source file:crunch.MaxTemperature.java
License:Apache License
/** * Creates a JobConf for a Job that will sum up the TrackStatistics per track. * //from ww w. j a va 2 s.c om * @param inputDir The path to the folder containing the raw input data files. * @return The sum JobConf. */ private JobConf getSumJobConf(Path inputDir) { log.info("Creating configuration for sum job"); // output results to a temporary intermediate folder, this will get deleted by start() method Path playsOutput = new Path("sum"); JobConf conf = new JobConf(TrackStatisticsProgram.class); conf.setOutputKeyClass(IntWritable.class); // track id conf.setOutputValueClass(TrackStats.class); // statistics for a track conf.setInputFormat(TextInputFormat.class); // raw listening data conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(SumMapper.class); conf.setCombinerClass(SumReducer.class); conf.setReducerClass(SumReducer.class); FileInputFormat.addInputPath(conf, inputDir); FileOutputFormat.setOutputPath(conf, playsOutput); conf.setJobName("sum"); return conf; }
From source file:crunch.MaxTemperature.java
License:Apache License
/** * Creates a JobConf for a Job that will merge the unique listeners and track statistics. * //from w ww .j a va 2s . co m * @param outputPath The path for the results to be output to. * @param sumInputDir The path containing the data from the sum Job. * @param listenersInputDir The path containing the data from the unique listeners job. * @return The merge JobConf. */ private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) { log.info("Creating configuration for merge job"); JobConf conf = new JobConf(TrackStatisticsProgram.class); conf.setOutputKeyClass(IntWritable.class); // track id conf.setOutputValueClass(TrackStats.class); // overall track statistics conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here conf.setReducerClass(SumReducer.class); conf.setOutputFormat(TextOutputFormat.class); FileOutputFormat.setOutputPath(conf, outputPath); MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class); MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class, MergeListenersMapper.class); conf.setJobName("merge"); return conf; }
From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java
License:Mozilla Public License
/** Starts the MapReduce indexing. * @param args/*w ww . j av a2 s . co m*/ * @throws Exception */ public int run(String[] args) throws Exception { long time = System.currentTimeMillis(); // For the moment: Hard-code the terrier home to quick test System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer"); boolean docPartitioned = false; int numberOfReducers = Integer .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26")); final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing"); if (args.length == 2 && args[0].equals("-p")) { logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices."); numberOfReducers = Integer.parseInt(args[1]); docPartitioned = true; } else if (args.length == 1 && args[0].equals("--merge")) { if (numberOfReducers > 1) mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers); else logger.error("No point merging 1 reduce task output"); return 0; } else if (args.length == 0) { logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index."); docPartitioned = false; if (numberOfReducers > MAX_REDUCE) { logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use " + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most"); } } /*else { logger.fatal(usage()); return 0; }*/ if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0], false) instanceof BitCompressionConfiguration)) { logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing" + " - you can recompress the inverted index later using IndexRecompressor"); return 0; } if (jf == null) throw new Exception("Could not get JobFactory from HadoopPlugin"); final JobConf conf = jf.newJob(); conf.setJarByClass(TerrierIndexing.class); conf.setJobName("StreamCorpusIndexer: Terrier Indexing"); if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH) && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) { logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + "," + ApplicationSetup.TERRIER_INDEX_PREFIX); return 0; } // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING; boolean blockIndexing = true; if (blockIndexing) { conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class); conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class); } else { conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class); conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class); } FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH)); conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX); conf.setMapOutputKeyClass(SplitEmittedTerm.class); conf.setMapOutputValueClass(MapEmittedPostingList.class); conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned); if (!conf.get("mapred.job.tracker").equals("local")) { conf.setMapOutputCompressorClass(GzipCodec.class); conf.setCompressMapOutput(true); } else { conf.setCompressMapOutput(false); } conf.setInputFormat(MultiFileCollectionInputFormat.class); conf.setOutputFormat(NullOutputFormat.class); conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class); conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class); conf.setReduceSpeculativeExecution(false); //parse the collection.spec BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC); String line = null; List<Path> paths = new ArrayList<Path>(); while ((line = specBR.readLine()) != null) { if (line.startsWith("#")) continue; paths.add(new Path(line)); } specBR.close(); FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()])); // not sure if this is effective in YARN conf.setNumMapTasks(2000); // increase the heap usage conf.set("mapreduce.map.memory.mb", "6100"); conf.set("mapred.job.map.memory.mb", "6100"); conf.set("mapreduce.reduce.memory.mb", "6144"); conf.set("mapred.job.reduce.memory.mb", "6144"); conf.set("mapreduce.map.java.opts", "-Xmx6100m"); conf.set("mapred.map.child.java.opts", "-Xmx6100m"); conf.set("mapreduce.reduce.java.opts", "-Xmx6144m"); conf.set("mapred.reduce.child.opts", "-Xmx6144m"); //conf.setBoolean("mapred.used.genericoptionsparser", true) ; // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it conf.set("mapreduce.job.user.classpath.first", "true"); // increase the yarn memory to 10 GB conf.set("yarn.nodemanager.resource.memory-mb", "12288"); conf.set("yarn.nodemanager.resource.cpu-vcores", "16"); conf.set("yarn.scheduler.minimum-allocation-mb", "4096"); conf.setNumReduceTasks(numberOfReducers); if (numberOfReducers > 1) { if (docPartitioned) conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class); else conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class); } else { //for JUnit tests, we seem to need to restore the original partitioner class conf.setPartitionerClass(HashPartitioner.class); } /*JobID jobId = null; boolean ranOK = true; try{ RunningJob rj = JobClient.runJob(conf); jobId = rj.getID(); HadoopUtility.finishTerrierJob(conf); } catch (Exception e) { logger.error("Problem running job", e); e.printStackTrace(); ranOK = false; } if (jobId != null) { deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId); } */ //if (ranOK) //{ System.out.println("Merging indices"); if (!docPartitioned) { if (numberOfReducers > 1) mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers); } Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH, docPartitioned ? numberOfReducers : 1, jf); //} System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds"); jf.close(); return 0; }