List of usage examples for org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.setOutputFormatClass
@SuppressWarnings("unchecked") public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)
From source file:com.baynote.kafka.hadoop.KafkaJobBuilder.java
License:Apache License
/** * Creates a {@link Job} based on how {@code this} {@link KafkaJobBuilder} has been configured. There are no * side-effects on {@code this} instance when you call this method, so you can call it multiple times. * // www . ja v a 2 s. com * @param conf * the job conf. * @return a fully configured {@link Job}. * @throws Exception error * @throws IllegalArgumentException * if any required parameters are not set. */ public Job configureJob(final Configuration conf) throws Exception { validateSettings(); final Job job = Job.getInstance(conf, getDefaultedJobName()); // set queue inputs if (getQueueMappers().size() == 1) { job.setInputFormatClass(KafkaInputFormat.class); final TopicConf topicConf = Iterables.getOnlyElement(getQueueMappers()); KafkaInputFormat.setTopic(job, topicConf.getTopic()); KafkaInputFormat.setConsumerGroup(job, topicConf.getConsumerGroup()); job.setMapperClass(topicConf.getMapper()); } else { job.setInputFormatClass(MultipleKafkaInputFormat.class); for (final TopicConf topicConf : getQueueMappers()) { MultipleKafkaInputFormat.addTopic(job, topicConf.getTopic(), topicConf.getConsumerGroup(), topicConf.getMapper()); } } if (getMapOutputKeyClass() != null) { job.setMapOutputKeyClass(getMapOutputKeyClass()); } if (getMapOutputValueClass() != null) { job.setMapOutputValueClass(getMapOutputValueClass()); } if (getReducerClass() == null) { job.setNumReduceTasks(0); } else { job.setReducerClass(getReducerClass()); job.setNumReduceTasks(getNumReduceTasks()); } if (getPartitionerClass() != null) { job.setPartitionerClass(getPartitionerClass()); } // set output job.setOutputFormatClass(getOutputFormatClass()); job.setOutputKeyClass(getOutputKeyClass()); job.setOutputValueClass(getOutputValueClass()); if (getOutputFormat() == SupportedOutputFormat.TEXT_FILE) { TextOutputFormat.setOutputPath(job, getDefaultedOutputPath()); } else if (getOutputFormat() == SupportedOutputFormat.SEQUENCE_FILE) { SequenceFileOutputFormat.setOutputPath(job, 
getDefaultedOutputPath()); } if (usingS3()) { job.getConfiguration().set("fs.s3n.awsAccessKeyId", getS3AccessKey()); job.getConfiguration().set("fs.s3n.awsSecretAccessKey", getS3SecretyKey()); job.getConfiguration().set("fs.s3.awsAccessKeyId", getS3AccessKey()); job.getConfiguration().set("fs.s3.awsSecretAccessKey", getS3SecretyKey()); } if (isLazyOutputFormat()) { LazyOutputFormat.setOutputFormatClass(job, getOutputFormatClass()); } // setup kafka input format specifics KafkaInputFormat.setZkConnect(job, getZkConnect()); KafkaInputFormat.setKafkaFetchSizeBytes(job, getKafkaFetchSizeBytes()); job.setSpeculativeExecution(false); job.setJarByClass(getClass()); // memory settings for mappers if (!Strings.isNullOrEmpty(getTaskMemorySettings())) { job.getConfiguration().set("mapred.child.java.opts", getTaskMemorySettings()); } return job; }
From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java
License:Apache License
private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output, int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException { int jobStatus = -1; switch (jobTypeI) { case SF2HB: { IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and output folder " + output); FileInputFormat.addInputPath(job, new Path(input)); job.setMapperClass(KVMapperFile.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BytesWritable.class); job.setReducerClass(KVReducerHBase.class); TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; }/* ww w . j a va 2 s .c o m*/ case SF2MF: { IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and output folder " + output); FileInputFormat.addInputPath(job, new Path(input)); job.setMapperClass(KVMapperFile.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BytesWritable.class); job.setReducerClass(KVReducerMapFile.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case SF2HF: { /* * First creates map file and then convert to hfile. 
* create intermediate dir for map file output * */ String intermediateFolder = output + "_intermediate"; Path intermediateOutpurDir = new Path(intermediateFolder); IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and intremediate output folder " + intermediateFolder + " final output dir " + output); //reset the output folder to intermediate folder Configuration conf = job.getConfiguration(); conf.set(OUTPUT_FOLDER, intermediateFolder); int jobT = JobTypeMapping.get("SF2MF"); jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter); if (jobStatus == 0) { Configuration hfileConf = HBaseConfiguration.create(); hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH)); Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile"); String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME; jobT = JobTypeMapping.get("IMF2HF"); jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter); } //delete intermediate dir FileSystem.get(conf).delete(intermediateOutpurDir, true); //delete the empty _SUCCESS folder FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true); return jobStatus; } case HB2HB: { if (fm.tableName.equals(input)) { throw new IOException("Input table and index table can not be same"); } Scan scan = new Scan(); scan.setCaching(scannerCacheSize); scan.setCacheBlocks(false); scan.addFamily(fm.familyName.getBytes()); if (null != filter) { if (filter.trim().length() > 0) { int index = filter.indexOf('='); scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(), filter.substring(0, index).getBytes(), CompareOp.EQUAL, filter.substring(index + 1).getBytes())); } } TableMapReduceUtil.initTableMapperJob(input, // input table scan, // Scan instance to control CF and attribute selection KVMapperHBase.class, // mapper class Text.class, // mapper output key BytesWritable.class, // mapper output 
value job); TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table KVReducerHBase.class, // reducer class job); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case HB2HF: { String intermediateFolder = output + "_intermediate"; Path intermediateOutpurDir = new Path(intermediateFolder); IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and intremediate output folder " + intermediateFolder + " final output dir " + output); //reset the output folder to intermediate folder Configuration conf = job.getConfiguration(); conf.set(OUTPUT_FOLDER, intermediateFolder); int jobT = JobTypeMapping.get("HB2MF"); jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter); if (jobStatus == 0) { Configuration hfileConf = HBaseConfiguration.create(); hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH)); Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile"); String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME; jobT = JobTypeMapping.get("IMF2HF"); jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter); } //delete intermediate dir FileSystem.get(conf).delete(intermediateOutpurDir, true); //delete the empty _SUCCESS folder FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true); return jobStatus; } case HB2MF: { if (fm.tableName.equals(input)) { throw new IOException("Input table and index table can not be same"); } Scan scan = new Scan(); scan.setCaching(scannerCacheSize); scan.setCacheBlocks(false); scan.addFamily(fm.familyName.getBytes()); if (null != filter) { if (filter.trim().length() > 0) { int index = filter.indexOf('='); scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(), filter.substring(0, index).getBytes(), CompareOp.EQUAL, filter.substring(index + 1).getBytes())); } } TableMapReduceUtil.initTableMapperJob(input, // input table scan, // Scan 
instance to control CF and attribute selection KVMapperHBase.class, // mapper class Text.class, // mapper output key BytesWritable.class, // mapper output value job); job.setReducerClass(KVReducerMapFile.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case IMF2HF: { Path finalOutputDir = new Path(output); job.setJarByClass(KVIndexer.class); job.setMapperClass(KVMapperHFile.class); job.setInputFormatClass(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(input)); FileOutputFormat.setOutputPath(job, finalOutputDir); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(KeyValue.class); HTable hTable = new HTable(job.getConfiguration(), fm.tableName); HFileOutputFormat.configureIncrementalLoad(job, hTable); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } default: throw new IOException("Invalid Jobtype " + jobTypeI); } }
From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java
License:Apache License
private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output, int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException { int jobStatus = -1; switch (jobTypeI) { case SF2HB: { IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and output folder " + output); FileInputFormat.addInputPath(job, new Path(input)); job.setMapperClass(KVMapperFile.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); job.setReducerClass(KVReducerHBase.class); TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; }/*w ww . j a v a 2s . c o m*/ case SF2HF: { //First creates map file and then convert to hfile. //create intermediate dir for map file output String intermediateFolder = output + "_intermediate"; Path intermediateOutpurDir = new Path(intermediateFolder); IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and intremediate output folder " + intermediateFolder + " final output dir " + output); //reset the output folder to intermediate folder Configuration conf = job.getConfiguration(); conf.set(OUTPUT_FOLDER, intermediateFolder); int jobT = JobTypeMapping.get("SF2MF"); jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter); if (jobStatus == 0) { Configuration hfileConf = HBaseConfiguration.create(); hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH)); Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile"); String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME; jobT = JobTypeMapping.get("IMF2HF"); jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter); } //delete 
intermediate dir FileSystem.get(conf).delete(intermediateOutpurDir, true); //delete the empty _SUCCESS folder FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true); return jobStatus; } case SF2MF: { IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and output folder " + output); FileInputFormat.addInputPath(job, new Path(input)); job.setMapperClass(KVMapperFile.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); job.setSortComparatorClass(TextPair.FirstComparator.class); job.setReducerClass(KVReducerMapFile.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(ImmutableBytesWritable.class); LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case MF2HB: { job.setMapperClass(KVMapperMapFile.class); job.setInputFormatClass(SequenceFileAsTextInputFormat.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); SequenceFileAsTextInputFormat.addInputPath(job, new Path(input)); job.setReducerClass(KVReducerHBase.class); TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job); jobStatus = job.waitForCompletion(true) ? 
0 : 1; return jobStatus; } case MF2HF: { String intermediateFolder = output + "_intermediate"; Path intermediateOutpurDir = new Path(intermediateFolder); IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and intremediate output folder " + intermediateFolder + " final output dir " + output); //reset the output folder to intermediate folder Configuration conf = job.getConfiguration(); conf.set(OUTPUT_FOLDER, intermediateFolder); int jobT = JobTypeMapping.get("MF2MF"); jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter); if (jobStatus == 0) { Configuration hfileConf = HBaseConfiguration.create(); hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH)); Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile"); String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME; jobT = JobTypeMapping.get("IMF2HF"); jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter); } //delete intermediate dir FileSystem.get(conf).delete(intermediateOutpurDir, true); //delete the empty _SUCCESS folder FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true); return jobStatus; } case MF2MF: { job.setMapperClass(KVMapperMapFile.class); job.setInputFormatClass(SequenceFileAsTextInputFormat.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); SequenceFileAsTextInputFormat.addInputPath(job, new Path(input)); job.setReducerClass(KVReducerMapFile.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(ImmutableBytesWritable.class); LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class); jobStatus = job.waitForCompletion(true) ? 
0 : 1; return jobStatus; } case HB2HB: { if (fm.tableName.equals(input)) { throw new IOException("Input table and index table can not be same"); } Scan scan = new Scan(); scan.setCaching(scannerCacheSize); scan.setCacheBlocks(false); scan.addFamily(fm.familyName.getBytes()); if (null != filter) { if (filter.trim().length() > 0) { int index = filter.indexOf('='); scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(), filter.substring(0, index).getBytes(), CompareOp.EQUAL, filter.substring(index + 1).getBytes())); } } TableMapReduceUtil.initTableMapperJob(input, // input table scan, // Scan instance to control CF and attribute selection KVMapperHBase.class, // mapper class TextPair.class, // mapper output key Text.class, // mapper output value job); TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table KVReducerHBase.class, // reducer class job); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case HB2HF: { String intermediateFolder = output + "_intermediate"; Path intermediateOutpurDir = new Path(intermediateFolder); IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR + " using hbase table : " + fm.tableName + " and intremediate output folder " + intermediateFolder + " final output dir " + output); //reset the output folder to intermediate folder Configuration conf = job.getConfiguration(); conf.set(OUTPUT_FOLDER, intermediateFolder); int jobT = JobTypeMapping.get("HB2MF"); jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter); if (jobStatus == 0) { Configuration hfileConf = HBaseConfiguration.create(); hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH)); Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile"); String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME; jobT = JobTypeMapping.get("IMF2HF"); jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter); } //delete intermediate dir 
FileSystem.get(conf).delete(intermediateOutpurDir, true); //delete the empty _SUCCESS folder FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true); return jobStatus; } case HB2MF: { if (fm.tableName.equals(input)) { throw new IOException("Input table and index table can not be same"); } Scan scan = new Scan(); scan.setCaching(scannerCacheSize); scan.setCacheBlocks(false); scan.addFamily(fm.familyName.getBytes()); if (null != filter) { if (filter.trim().length() > 0) { int index = filter.indexOf('='); scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(), filter.substring(0, index).getBytes(), CompareOp.EQUAL, filter.substring(index + 1).getBytes())); } } TableMapReduceUtil.initTableMapperJob(input, // input table scan, // Scan instance to control CF and attribute selection KVMapperHBase.class, // mapper class TextPair.class, // mapper output key Text.class, // mapper output value job); job.setReducerClass(KVReducerMapFile.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(ImmutableBytesWritable.class); LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } case IMF2HF: { Path finalOutputDir = new Path(output); job.setJarByClass(KVIndexer.class); job.setMapperClass(KVMapperHFile.class); job.setInputFormatClass(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(input)); FileOutputFormat.setOutputPath(job, finalOutputDir); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(KeyValue.class); HTable hTable = new HTable(job.getConfiguration(), fm.tableName); HFileOutputFormat.configureIncrementalLoad(job, hTable); jobStatus = job.waitForCompletion(true) ? 0 : 1; return jobStatus; } default: throw new IOException("Invalid Jobtype " + jobTypeI); } }
From source file:com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name);/*from ww w.j a v a2 s. c o m*/ job.setJarByClass(DerivedColumnTransformationPhaseJob.class); Configuration configuration = job.getConfiguration(); FileSystem fs = FileSystem.get(configuration); // Input Path String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(",")) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Topk path String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH); LOGGER.info("Topk path : " + topkPath); // Output path Path outputPath = new Path( getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); // ThirdEyeConfig String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); // New schema Schema outputSchema = newSchema(thirdeyeConfig); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), 
outputSchema.toString()); // Map config job.setMapperClass(DerivedColumnTransformationPhaseMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, outputSchema); LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class); AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema); job.setNumReduceTasks(0); job.waitForCompletion(true); return job; }
From source file:com.pagerankcalculator.TwitterPageRank.java
/** * Graph Parsing/* w w w.ja v a 2 s . c o m*/ * Memasukan data mentah dan melakukan inisialisasi pagerank * * @param in file data masukan * @param out direktori output */ public int parseGraph(String in, String out) throws IOException, InterruptedException, ClassNotFoundException { Job job = Job.getInstance(getConf()); job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#1 Parsing Graph"); job.setJarByClass(TwitterPageRank.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(GraphParsingMapper.class); job.setReducerClass(GraphParsingReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); Path inputFilePath = new Path(in); Path outputFilePath = new Path(out); FileInputFormat.addInputPath(job, inputFilePath); FileOutputFormat.setOutputPath(job, outputFilePath); FileSystem fs = FileSystem.newInstance(getConf()); if (fs.exists(outputFilePath)) { fs.delete(outputFilePath, true); } return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.pagerankcalculator.TwitterPageRank.java
/**
 * Runs one page rank calculation iteration as a MapReduce job.
 *
 * <p>An existing output directory is deleted before the job is run.
 *
 * @param in        input path
 * @param out       output directory
 * @param iteration iteration number, used only in the job name
 * @return {@code 0} if the job completed successfully, {@code 1} otherwise
 */
public int calculatePagerank(String in, String out, int iteration)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#2 Iteration-" + iteration + " Calculating Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankCalculationMapper.class);
    job.setReducerClass(PageRankCalculationReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    // Resolve the file system from the output path itself: this works for any scheme and returns the
    // shared cached instance, whereas FileSystem.newInstance(...) created a private instance that was
    // never closed.
    FileSystem fs = outputFilePath.getFileSystem(getConf());
    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.pagerankcalculator.TwitterPageRank.java
public int sortPagerank(String in, String out) throws IOException, InterruptedException, ClassNotFoundException { Job job = Job.getInstance(getConf()); job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#3 Sorting Page Rank"); job.setJarByClass(TwitterPageRank.class); job.setMapOutputKeyClass(DoubleWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(PageRankSortingMapper.class); job.setReducerClass(PageRankSortingReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setNumReduceTasks(1);//from w w w . j av a 2 s. c o m LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); job.setSortComparatorClass(DoubleSortDescComparator.class); Path inputFilePath = new Path(in); Path outputFilePath = new Path(out); FileInputFormat.addInputPath(job, inputFilePath); FileOutputFormat.setOutputPath(job, outputFilePath); FileSystem fs = FileSystem.newInstance(getConf()); if (fs.exists(outputFilePath)) { fs.delete(outputFilePath, true); } return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.rw.legion.DefaultJob.java
License:Apache License
/** * Main method.//from w w w . j a va 2s . co m * * @param args Arguments should be: 1) input path, 2) output path, 3) * location of Legion objective file. */ public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); // Load the Legion objective from the JSON doc. Path path = new Path(args[2]); FileSystem fs = FileSystem.get(new URI(args[2]), conf); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String json = ""; String line = br.readLine(); while (line != null) { json += line; line = br.readLine(); } br.close(); /* * Save the JSON for the Legion objective to the Hadoop configuration, * so we can access it in other containers. */ conf.setStrings("legion_objective", json); // De-serialize the objective so we can access the settings here. LegionObjective legionObjective = ObjectiveDeserializer.deserialize(json); // Start configuring the MapReduce job. Job hadoopJob = Job.getInstance(conf, "Legion"); hadoopJob.setJarByClass(DefaultJob.class); hadoopJob.setMapperClass(DefaultMapper.class); LazyOutputFormat.setOutputFormatClass(hadoopJob, TextOutputFormat.class); // Compress the output to speed things up. TextOutputFormat.setCompressOutput(hadoopJob, true); TextOutputFormat.setOutputCompressorClass(hadoopJob, GzipCodec.class); // What input format do we use? try { @SuppressWarnings("unchecked") Class<? extends FileInputFormat<NullWritable, LegionRecord>> inputClass = (Class<? extends FileInputFormat<NullWritable, LegionRecord>>) Class .forName(legionObjective.getInputFormat()); hadoopJob.setInputFormatClass(inputClass); } catch (Exception e) { throw new JsonParseException( "Problem loading input format " + "class '" + legionObjective.getInputFormat() + "'"); } // Should we set a max combined size? 
if (legionObjective.getMaxCombinedSize() != null) { CombineFileInputFormat.setMaxInputSplitSize(hadoopJob, legionObjective.getMaxCombinedSize()); } /* * These are just static convenience methods, so it doesn't matter if * they come from the wrong class. */ FileInputFormat.setInputDirRecursive(hadoopJob, true); FileInputFormat.addInputPath(hadoopJob, new Path(args[0])); FileOutputFormat.setOutputPath(hadoopJob, new Path(args[1])); // Since a Legion objective can specify multiple output tables. for (OutputTable outputTable : legionObjective.getOutputTables()) { MultipleOutputs.addNamedOutput(hadoopJob, outputTable.getTitle(), TextOutputFormat.class, NullWritable.class, Text.class); } MultipleOutputs.addNamedOutput(hadoopJob, "skipped", TextOutputFormat.class, NullWritable.class, Text.class); hadoopJob.waitForCompletion(true); }
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/** * Partition A on columns, where A refers to the path that contain a matrix in * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for * further details.// ww w. j av a 2s.c o m * * @param conf the initial configuration * @param matrixInputPath the path to the input matrix A * @param matrixOutputPath the path of the resulting partitioned matrix * @param numInputRows rows * @param numInputCols cols * @param numColPartitions the hint for the desired number of column * partitions * @return the running job * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows, int numInputCols, int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition"); int colPartSize = getColPartitionSize(numInputCols, numColPartitions); numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize); if (numReducers < numColPartitions) numReducers = numColPartitions; NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition"); conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows); conf.setInt(NUM_ORIG_COLS_KEY, numInputCols); conf.setInt(NUM_COL_PARTITIONS, numColPartitions); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(ColPartitionJob.class); job.setJobName(ColPartitionJob.class.getSimpleName()); matrixOutputPath = fs.makeQualified(matrixOutputPath); MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(ElementWritable.class); job.setMapOutputValueClass(VectorWritable.class); RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class, 
numInputRows, numInputCols, numColPartitions); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(numReducers); // job.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); return job; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper.java
License:Apache License
/** * Job configurator//from w w w . j a v a2 s . c o m * * @param job job instance * @param jarByClass class of the jar * @param mapperClass mapper * @param reducerClass reducer * @param commaSeparatedInputFiles input paths * @param outputPath output * @throws IOException I/O exception */ public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath) throws IOException { job.setJarByClass(jarByClass); job.setJobName(jarByClass.getName()); // mapper job.setMapperClass(mapperClass); // reducer job.setReducerClass(reducerClass); // input-output is warc job.setInputFormatClass(WARCInputFormat.class); // prevent producing empty files LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class); // intermediate data job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(WARCWritable.class); // output data job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(WARCWritable.class); // set output compression to GZip FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); }