List of usage examples for org.apache.hadoop.mapreduce.Job setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException
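setMapOutputKeyClass declares the key class the mapper emits. It is only needed when the map output key class differs from the job's final output key class, which is otherwise used as the default. Before the per-project examples below, here is a minimal self-contained sketch; the TokenMapper and SumReducer classes are illustrative placeholders, not taken from any of the source files listed:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOutputKeyExample {

    // Emits (Text, IntWritable) pairs: one count per whitespace-separated token.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (token.isEmpty()) {
                    continue;
                }
                context.write(new Text(token), ONE);
            }
        }
    }

    // Emits (Text, Text): the final output value type differs from the map output value type.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new Text(Long.toString(sum)));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "map output key example");
        job.setJarByClass(MapOutputKeyExample.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // The map output classes differ from the final output classes, so both pairs
        // must be declared; without setMapOutputKeyClass/setMapOutputValueClass the
        // framework assumes the map output types match the final output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}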
From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java
License:Apache License
/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
                getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java
License:Apache License
@Override
protected void setupJob(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(TextIndexingMapper.class);
    job.setReducerClass(TextIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    params = new IndexConfig();
    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();
    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }
    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());

    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
            params.getIndex(), (String) null);

    job.waitForCompletion(true);
    return 0;
}
From source file:com.veera.secondarysort.demo2.SsJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SsJob.class);

    // Secondary sort: partition and group on the natural key, but sort on the
    // full composite key so values arrive at the reducer in sorted order.
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    // The composite map output key (StockKey) differs from the final Text output key.
    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);
    return 0;
}
From source file:com.wibidata.wibidota.DotaGatherExampleValues.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);
    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Value Counter");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Add.class);
    job.setReducerClass(Add.class);
    job.setJarByClass(DotaValuesCounter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.wibidata.wibidota.DotaMaxAccountId.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Max Builder");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(DotaMaxAccountId.Map.class);
    job.setCombinerClass(DotaMaxAccountId.TakeMax.class);
    job.setReducerClass(DotaMaxAccountId.TakeMax.class);
    job.setJarByClass(DotaMaxAccountId.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.wipro.ats.bdre.dq.DQDriver.java
License:Apache License
@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);

    Configuration conf = getConf();
    conf.set("dq.process.id", processId);

    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    // Valid records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    // Input and quality-filtered file should have the same name (but a different path)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }

    // Invalid records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Prepare the report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);
    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);
    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);
    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);

    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge report records
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    // Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);

    // Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);

    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    // empty trash
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));

    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();

    String envt = null;
    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }

    Properties prop = new Properties();
    try {
        // load a properties file from the classpath, inside a static method
        prop.load(SKU2SKUCount.class.getClassLoader()
                .getResourceAsStream("config-" + envt + ".properties"));
    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }

    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");
    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + "." + ".out_stat_custom");
    log.debug("Conf: " + conf);

    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");
    job.setJarByClass(SKU2SKUCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path("inputPview"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}