List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
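For orientation before the project-specific examples, here is a minimal, self-contained driver sketch showing the call. The class name OutputFormatExample and the map-only identity setup are illustrative assumptions, not taken from the sources below; the deprecated new Job(conf, name) constructor is used only to match the style of those examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "output format example");
        job.setJarByClass(OutputFormatExample.class);

        // Map-only identity job: the default Mapper passes (LongWritable, Text)
        // records from TextInputFormat straight through to the output format.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        // The call this page documents: selects the OutputFormat that writes the job's records.
        // Throws IllegalStateException if the job has already been submitted.
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}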
From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java
License:Open Source License
/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 * @param conf
 * @return
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
        getColumnName());
    return job;
}
From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java
License:Apache License
/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
            getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java
License:Apache License
@Override
protected void setupJob(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(TextIndexingMapper.class);
    job.setReducerClass(TextIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    params = new IndexConfig();
    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();
    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }
    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());

    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
        params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}
From source file:com.twitter.elephanttwin.retrieval.TestIndexedReader.java
License:Apache License
@Override public int run(String[] args) throws Exception { LOG.info(" - input: " + CmdParams.input.get()); LOG.info(" - output: " + CmdParams.output.get()); Configuration conf = getConf(); BinaryExpression filter = new BinaryExpression(new Expression.Column(CmdParams.columnname.get()), new Expression.Const(CmdParams.searchvalue.get()), Expression.OpType.OP_EQ); String filterCondString = ObjectSerializer.serialize(filter); conf.set(OneSplitInputFormat.FILTERCONDITIONS, filterCondString); Path outputDir = new Path(CmdParams.output.get()); FileSystem fs = outputDir.getFileSystem(conf); fs.delete(outputDir, true);/* w ww . jav a2 s. co m*/ Job job = new Job(new Configuration(conf)); job.setJarByClass(getClass()); job.setInputFormatClass(OneSplitInputFormat.class); OneSplitInputFormat.setSplit(job, CmdParams.startoffset.get(), CmdParams.endoffset.get()); OneSplitInputFormat.setIndexOptions(job, CmdParams.inputformat.get(), CmdParams.value_class.get(), "/tmp", CmdParams.columnname.get()); job.setMapperClass(PrintBinaryMapper.class); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, new Path(CmdParams.output.get())); FileInputFormat.setInputPaths(job, CmdParams.input.get()); job.setJobName("TestIndexedLZOReader:" + CmdParams.input.get()); job.waitForCompletion(true); return 0; }
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
/**
 * @param conf
 *          to use to create and run the job
 * @param scan
 *          to be used to scan the raw table.
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount) throws IOException {

    Configuration confClone = new Configuration(conf);

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(confClone, getJobName(totalJobCount));

    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);
    job.setJarByClass(JobFileProcessor.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan, JobFileTableMapper.class,
        JobFileTableMapper.getOutputKeyClass(), JobFileTableMapper.getOutputValueClass(), job);

    return job;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
/**
 * @param myHBaseConf
 *          to use to create and run the job. Should be an HBase
 *          configuration.
 * @param input
 *          path to the processFile
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used in job name.
 * @return whether all job confs were loaded properly.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private boolean runRawLoaderJob(Configuration myHBaseConf, String input, int totalJobCount)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean success;

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(myHBaseConf, getJobName(totalJobCount));
    job.setJarByClass(JobFileRawLoader.class);

    Path inputPath = new Path(input);

    if (hdfs.exists(inputPath)) {
        // Set input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(job, inputPath);

        job.setMapperClass(JobFileRawLoaderMapper.class);

        // Set the output format to push data into HBase.
        job.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, job);

        job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
        job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());

        // This is a map-only class, skip reduce step
        job.setNumReduceTasks(0);

        // Run the job
        success = job.waitForCompletion(true);

        if (success) {
            success = hdfs.delete(inputPath, false);
        }
    } else {
        System.err.println("Unable to find processFile: " + inputPath);
        success = false;
    }
    return success;
}
From source file:com.twitter.scalding.parquet.scrooge.TestCorruptScroogeRecords.java
License:Apache License
@Override
public void setupJob(Job job, Path path) throws Exception {
    job.setInputFormatClass(ParquetScroogeInputFormat.class);
    ParquetScroogeInputFormat.setInputPaths(job, path);
    ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class);
    ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class);

    job.setMapperClass(ReadMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
}
From source file:com.veera.secondarysort.demo2.SsJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf, "secondary sort"); job.setJarByClass(SsJob.class); job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); job.setMapOutputKeyClass(StockKey.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(SsMapper.class); job.setReducerClass(SsReducer.class); job.waitForCompletion(true);/*w w w . j a va 2s.c om*/ return 0; }
From source file:com.wibidata.wibidota.DotaGatherExampleValues.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);
    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}