List of usage examples for org.apache.hadoop.mapreduce.Job.setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
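Before the per-project examples, here is a minimal, self-contained sketch of the typical call pattern (the class name and the use of the built-in identity Mapper are illustrative choices, not taken from any of the source files below). setInputFormatClass must be called before the job is submitted; after submission it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalSetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "minimal setInputFormatClass example");
        job.setJarByClass(MinimalSetInputFormatExample.class);

        // Must be set before job submission; otherwise IllegalStateException is thrown.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity mapper, map-only job: TextInputFormat emits (LongWritable offset, Text line).
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}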
From source file:com.twitter.scalding.parquet.scrooge.TestCorruptScroogeRecords.java
License:Apache License
@Override
public void setupJob(Job job, Path path) throws Exception {
    job.setInputFormatClass(ParquetScroogeInputFormat.class);
    ParquetScroogeInputFormat.setInputPaths(job, path);
    ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class);
    ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class);
    job.setMapperClass(ReadMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
}
From source file:com.veera.secondarysort.demo2.SsJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SsJob.class);

    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);
    return 0;
}
From source file:com.wibidata.wibidota.DotaGatherExampleValues.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);
    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Value Counter");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Add.class);
    job.setReducerClass(Add.class);
    job.setJarByClass(DotaValuesCounter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.wibidata.wibidota.DotaMaxAccountId.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Max Builder");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(DotaMaxAccountId.Map.class);
    job.setCombinerClass(DotaMaxAccountId.TakeMax.class);
    job.setReducerClass(DotaMaxAccountId.TakeMax.class);
    job.setJarByClass(DotaMaxAccountId.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}
From source file:com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormat.java
License:Apache License
public static void setInput(Job job, Class<? extends DynamoDBKeyWritable> inputClass, String tableName) {
    job.setInputFormatClass(DynamoDBQueryInputFormat.class);
    DynamoDBConfiguration dbConf = new DynamoDBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
}
From source file:com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBScanInputFormat.java
License:Apache License
public static void setInput(Job job, Class<? extends DynamoDBKeyWritable> inputClass, String tableName) {
    job.setInputFormatClass(DynamoDBScanInputFormat.class);
    DynamoDBConfiguration dbConf = new DynamoDBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
}
From source file:com.wipro.ats.bdre.datagen.mr.Driver.java
License:Apache License
/**
 * @param args the CLI arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));

    // set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    // merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    // empty trash
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));

    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }
    String hdfsPath = cmd.getArgs()[0];

    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }

    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }
    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE / as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}