List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
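For orientation before the project-specific examples, here is a minimal, self-contained driver sketch showing the call. The class name OutputFormatExample and the map-only identity setup are illustrative assumptions, not taken from the sources below; the deprecated new Job(conf, name) constructor is used only to match the style of those examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "output format example");
        job.setJarByClass(OutputFormatExample.class);

        // Map-only identity job: the default Mapper passes (LongWritable, Text)
        // records from TextInputFormat straight through to the output format.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        // The call this page documents: selects the OutputFormat that writes the job's records.
        // Throws IllegalStateException if the job has already been submitted.
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}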
From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java
License:Open Source License
/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 * @param conf
 * @return
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
        getColumnName());
    return job;
}
From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java
License:Apache License
/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
            getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java
License:Apache License
@Override
protected void setupJob(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(TextIndexingMapper.class);
    job.setReducerClass(TextIndexingReducer.class);
}
From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    params = new IndexConfig();
    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();
    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }
    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());

    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
        params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}
From source file:com.twitter.elephanttwin.retrieval.TestIndexedReader.java
License:Apache License
@Override public int run(String[] args) throws Exception { LOG.info(" - input: " + CmdParams.input.get()); LOG.info(" - output: " + CmdParams.output.get()); Configuration conf = getConf(); BinaryExpression filter = new BinaryExpression(new Expression.Column(CmdParams.columnname.get()), new Expression.Const(CmdParams.searchvalue.get()), Expression.OpType.OP_EQ); String filterCondString = ObjectSerializer.serialize(filter); conf.set(OneSplitInputFormat.FILTERCONDITIONS, filterCondString); Path outputDir = new Path(CmdParams.output.get()); FileSystem fs = outputDir.getFileSystem(conf); fs.delete(outputDir, true);/* w ww . jav a2 s. co m*/ Job job = new Job(new Configuration(conf)); job.setJarByClass(getClass()); job.setInputFormatClass(OneSplitInputFormat.class); OneSplitInputFormat.setSplit(job, CmdParams.startoffset.get(), CmdParams.endoffset.get()); OneSplitInputFormat.setIndexOptions(job, CmdParams.inputformat.get(), CmdParams.value_class.get(), "/tmp", CmdParams.columnname.get()); job.setMapperClass(PrintBinaryMapper.class); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, new Path(CmdParams.output.get())); FileInputFormat.setInputPaths(job, CmdParams.input.get()); job.setJobName("TestIndexedLZOReader:" + CmdParams.input.get()); job.waitForCompletion(true); return 0; }
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
/**
 * @param conf
 *          to use to create and run the job
 * @param scan
 *          to be used to scan the raw table.
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount) throws IOException {

    Configuration confClone = new Configuration(conf);

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(confClone, getJobName(totalJobCount));

    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);
    job.setJarByClass(JobFileProcessor.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan, JobFileTableMapper.class,
        JobFileTableMapper.getOutputKeyClass(), JobFileTableMapper.getOutputValueClass(), job);

    return job;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
/**
 * @param myHBaseConf
 *          to use to create and run the job. Should be an HBase
 *          configuration.
 * @param input
 *          path to the processFile
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used in job name.
 * @return whether all job confs were loaded properly.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private boolean runRawLoaderJob(Configuration myHBaseConf, String input, int totalJobCount)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean success;

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(myHBaseConf, getJobName(totalJobCount));
    job.setJarByClass(JobFileRawLoader.class);

    Path inputPath = new Path(input);

    if (hdfs.exists(inputPath)) {
        // Set input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(job, inputPath);

        job.setMapperClass(JobFileRawLoaderMapper.class);

        // Set the output format to push data into HBase.
        job.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, job);

        job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
        job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());

        // This is a map-only class, skip reduce step
        job.setNumReduceTasks(0);

        // Run the job
        success = job.waitForCompletion(true);

        if (success) {
            success = hdfs.delete(inputPath, false);
        }
    } else {
        System.err.println("Unable to find processFile: " + inputPath);
        success = false;
    }
    return success;
}
From source file:com.twitter.scalding.parquet.scrooge.TestCorruptScroogeRecords.java
License:Apache License
@Override
public void setupJob(Job job, Path path) throws Exception {
    job.setInputFormatClass(ParquetScroogeInputFormat.class);
    ParquetScroogeInputFormat.setInputPaths(job, path);
    ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class);
    ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class);

    job.setMapperClass(ReadMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
}
From source file:com.veera.secondarysort.demo2.SsJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf, "secondary sort"); job.setJarByClass(SsJob.class); job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); job.setMapOutputKeyClass(StockKey.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(SsMapper.class); job.setReducerClass(SsReducer.class); job.waitForCompletion(true);/*w w w . j a va 2s.c om*/ return 0; }
From source file:com.wibidata.wibidota.DotaGatherExampleValues.java
License:Apache License
public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);
    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}