Usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
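Before the collected examples below, here is a minimal, self-contained sketch of the call in context. This is an illustrative sketch only: the class name, job name, and input/output paths are hypothetical, and the default (identity) Mapper is used to keep it short. Note that setInputFormatClass must be called before the job is submitted; once the job is running it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalInputFormatExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "minimal-setInputFormatClass-example");
    job.setJarByClass(MinimalInputFormatExample.class);

    // Choose how input files are split and turned into records.
    // Must be set before submission; throws IllegalStateException afterwards.
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Map-only identity job: TextInputFormat emits <LongWritable, Text> pairs.
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}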
From source file: com.twitter.algebra.nmf.SampleColsJob.java
License: Apache License
public void run(Configuration conf, Path matrixInputPath, int cols, Path matrixOutputPath, float sampleRate)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf = new Configuration(conf);
  conf.setFloat(SAMPLERATE, sampleRate);
  conf.setInt(COLS, cols);

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplecol");

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(SampleColsJob.class);
  job.setJobName(SampleColsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(MyMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}
From source file: com.twitter.algebra.nmf.SampleRowsJob.java
License: Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, float sampleRate)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf = new Configuration(conf);
  conf.setFloat(SAMPLERATE, sampleRate);

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplerows");

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(SampleRowsJob.class);
  job.setJobName(SampleRowsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(MyMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}
From source file: com.twitter.algebra.nmf.XtXJob.java
License: Apache License
public void run(Configuration conf, Path matrixInputPath, int numCols, String xmPath, Path matrixOutputPath)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf = new Configuration(conf);
  conf.setInt(MATRIXCOLS, numCols);
  // conf.set(XMPATH, xmPath);

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, new Path[] { matrixInputPath }, "xtx");

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJobName("XtXJob-" + matrixOutputPath.getName());
  job.setJarByClass(XtXJob.class);

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(VectorWritable.class);

  int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "xtx");
  job.setNumReduceTasks(numReducers);
  // ensures total order (when used with {@link MatrixOutputFormat})
  RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numCols);

  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  job.waitForCompletion(true);
}
From source file: com.twitter.algebra.TransposeJob.java
License: Apache License
/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}.
 *
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
    int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
  conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
  conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(TransposeJob.class);
  job.setJobName(TransposeJob.class.getSimpleName());

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(TransposeMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(VectorWritable.class);

  int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
  job.setNumReduceTasks(numReducers);
  // job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
  RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);

  job.setCombinerClass(MergeVectorsCombiner.class);
  job.setReducerClass(MergeVectorsReducer.class);
  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}
From source file: com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java
License: Open Source License
/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 *
 * @param conf the base configuration, cloned for the new job
 * @return the configured indexing Job
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
  Job job = new Job(new Configuration(conf));
  job.setJarByClass(getClass());
  job.setInputFormatClass(BlockIndexedFileInputFormat.class);
  job.setReducerClass(MapFileIndexingReducer.class);
  job.setMapOutputKeyClass(TextLongPairWritable.class);
  job.setMapOutputValueClass(LongPairWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(ListLongPair.class);
  job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
  job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
  job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  job.setNumReduceTasks(getNumPartitions());
  BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
      getColumnName());
  return job;
}
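As the Javadoc notes, a subclass that needs to adjust the configuration can override setupJob() and delegate back to this base implementation. The following is a minimal hypothetical sketch that relies only on the signature shown above; the subclass name and the property it tweaks are illustrative assumptions, not part of elephant-twin:

public class MyBlockIndexingJob extends AbstractBlockIndexingJob {
  @Override
  protected Job setupJob(Configuration conf) throws IOException {
    // Illustrative assumption: raise the task timeout for slow index builds.
    conf.set("mapred.task.timeout", "1200000");
    // Delegate to the base wiring (remember to call super.setupJob()!).
    return super.setupJob(conf);
  }
  // The base class's other hook methods (setMapper(), getNumPartitions(),
  // getInputFormat(), getValueClass(), getIndex(), getColumnName(), ...)
  // would also need implementations where abstract; omitted for brevity.
}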
From source file: com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java
License: Apache License
/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at which splits contain the values you are
 * looking for. You are on your own for filtering the splits appropriately before creating an
 * MR job, but check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat.
 */
@Override
protected void setupJob(Job job) {
  Configuration conf = job.getConfiguration();
  conf.set("mapred.child.java.opts", "-Xmx4g");
  List<String> fieldNames = Lists.newArrayList();
  for (IndexedField field : getIndexedFields()) {
    fieldNames.add(field.getFieldName());
    conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
        getExtractorClassName(field.getFieldName()));
  }
  conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(HadoopSplitDocument.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(getInputFormatClass());
  job.setMapperClass(HadoopSplitIndexingMapper.class);
  job.setReducerClass(HadoopSplitIndexingReducer.class);
}
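A concrete subclass would typically extend this hook in the same spirit: call super.setupJob(job) to keep the wiring above, then layer on its own settings. A minimal hypothetical sketch follows; the subclass name and the heap setting are illustrative assumptions only:

public class MyLogSplitIndexingJob extends HadoopSplitIndexingJob {
  @Override
  protected void setupJob(Job job) {
    super.setupJob(job); // keep the lucene split-index wiring above
    // Illustrative assumption: these index-building mappers need more heap,
    // overriding the -Xmx4g set by the base implementation.
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx8g");
  }
  // getIndexedFields(), getExtractorClassName(...), and getInputFormatClass()
  // implementations omitted for brevity.
}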
From source file: com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java
License: Apache License
@Override
protected void setupJob(Job job) {
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setMapperClass(TextIndexingMapper.class);
  job.setReducerClass(TextIndexingReducer.class);
}
From source file: com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  params = new IndexConfig();
  LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
  LOG.info(" - output: " + IndexConfig.output.get());

  Configuration conf = getConf();
  Path outputDir = new Path(params.getOutput());
  FileSystem fs = outputDir.getFileSystem(conf);
  fs.delete(outputDir, true);

  int totalInputFiles = 0;
  List<FileStatus> stats = Lists.newArrayList();
  for (String s : params.getInput()) {
    Path spath = new Path(IndexConfig.index.get() + s);
    HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
  }
  totalInputFiles = stats.size();
  LOG.info(totalInputFiles + " total index files to be scanned");

  conf.set(IndexScanMapper.searchColumnName, params.getColumnName());

  Job job = new Job(new Configuration(conf));
  job.setJarByClass(getClass());
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

  for (FileStatus file : stats)
    FileInputFormat.addInputPath(job, file.getPath());

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  job.setMapperClass(IndexScanMapper.class);
  job.setCombinerClass(LongSumReducer.class);
  job.setReducerClass(LongSumReducer.class);
  job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());

  BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
      params.getIndex(), (String) null);

  job.waitForCompletion(true);
  return 0;
}
From source file: com.twitter.elephanttwin.retrieval.TestIndexedReader.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  LOG.info(" - input: " + CmdParams.input.get());
  LOG.info(" - output: " + CmdParams.output.get());

  Configuration conf = getConf();
  BinaryExpression filter = new BinaryExpression(new Expression.Column(CmdParams.columnname.get()),
      new Expression.Const(CmdParams.searchvalue.get()), Expression.OpType.OP_EQ);
  String filterCondString = ObjectSerializer.serialize(filter);
  conf.set(OneSplitInputFormat.FILTERCONDITIONS, filterCondString);

  Path outputDir = new Path(CmdParams.output.get());
  FileSystem fs = outputDir.getFileSystem(conf);
  fs.delete(outputDir, true);

  Job job = new Job(new Configuration(conf));
  job.setJarByClass(getClass());
  job.setInputFormatClass(OneSplitInputFormat.class);
  OneSplitInputFormat.setSplit(job, CmdParams.startoffset.get(), CmdParams.endoffset.get());
  OneSplitInputFormat.setIndexOptions(job, CmdParams.inputformat.get(), CmdParams.value_class.get(), "/tmp",
      CmdParams.columnname.get());
  job.setMapperClass(PrintBinaryMapper.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(CmdParams.output.get()));
  FileInputFormat.setInputPaths(job, CmdParams.input.get());
  job.setJobName("TestIndexedLZOReader:" + CmdParams.input.get());

  job.waitForCompletion(true);
  return 0;
}
From source file: com.twitter.hraven.etl.JobFileRawLoader.java
License: Apache License
/**
 * @param myHBaseConf
 *          used to create and run the job; should be an HBase configuration
 * @param input
 *          path to the processFile
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch; used in the job name
 * @return whether all job confs were loaded properly
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private boolean runRawLoaderJob(Configuration myHBaseConf, String input, int totalJobCount)
    throws IOException, InterruptedException, ClassNotFoundException {
  boolean success;

  // Turn off speculative execution.
  // Note: must be BEFORE the job construction with the new mapreduce API.
  myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false);

  // Set up job
  Job job = new Job(myHBaseConf, getJobName(totalJobCount));
  job.setJarByClass(JobFileRawLoader.class);

  Path inputPath = new Path(input);
  if (hdfs.exists(inputPath)) {
    // Set input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, inputPath);
    job.setMapperClass(JobFileRawLoaderMapper.class);

    // Set the output format to push data into HBase.
    job.setOutputFormatClass(TableOutputFormat.class);
    TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, job);

    job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
    job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());

    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);

    // Run the job
    success = job.waitForCompletion(true);
    if (success) {
      success = hdfs.delete(inputPath, false);
    }
  } else {
    System.err.println("Unable to find processFile: " + inputPath);
    success = false;
  }
  return success;
}