Example usage for the org.apache.hadoop.mapreduce.Job constructor Job

List of usage examples for the org.apache.hadoop.mapreduce.Job constructor Job

Introduction

On this page you can find example usages of the org.apache.hadoop.mapreduce.Job constructor Job.

Prototype

Job(JobConf conf) throws IOException 

Source Link

Usage

From source file:com.talis.hadoop.rdf.merge.IndexMerge.java

License:Apache License

/**
 * Configures and runs the index-merge job, blocking until it finishes.
 *
 * <p>Map-output compression is enabled when {@code Constants.OPTION_USE_COMPRESSION} is set,
 * and an existing output path is deleted first when {@code Constants.OPTION_OVERRIDE_OUTPUT}
 * is set.
 *
 * @param args command-line arguments: {@code args[0]} is the input path and
 *          {@code args[1]} is the output path
 * @return 0 if the job completed successfully, -1 if it failed or the arguments are missing
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public int run(String[] args) throws Exception {
    // Report a usage error instead of failing with an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: " + JOB_NAME + " <input> <output>");
        return -1;
    }

    // Build both paths up front so a malformed argument is detected before anything is deleted.
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(output.toUri(), configuration);
    if (overrideOutput) {
        // Recursively remove any existing output before the job runs.
        fs.delete(output, true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    // No custom mapper: the base Mapper class is registered directly.
    job.setMapperClass(Mapper.class);
    job.setReducerClass(IndexMergeReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // A single reduce task is configured for the merge.
    job.setNumReduceTasks(1);

    if (LOG.isDebugEnabled()) {
        Utils.log(job, LOG);
    }

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.talis.hadoop.rdf.solr.QuadsIndexer.java

License:Apache License

/**
 * Configures and runs the quads-indexing job, blocking until it finishes.
 *
 * <p>Map-output compression is enabled when {@code Constants.OPTION_USE_COMPRESSION} is set,
 * and an existing output path is deleted first when {@code Constants.OPTION_OVERRIDE_OUTPUT}
 * is set. The Solr configuration path is registered as a distributed-cache archive under
 * the name {@code solr.zip}.
 *
 * @param args command-line arguments: {@code args[0]} is the input path, {@code args[1]}
 *          the output path and {@code args[2]} the Solr configuration path
 * @return 0 if the job completed successfully, -1 if it failed or the arguments are missing
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public int run(String[] args) throws Exception {
    // Report a usage error instead of failing with an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 3) {
        System.err.println("Usage: " + JOB_NAME + " <input> <output> <solr-config>");
        return -1;
    }

    // Build all paths up front so a malformed argument is detected before anything is deleted.
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem outputFs = FileSystem.get(output.toUri(), configuration);
    if (overrideOutput) {
        // Recursively remove any existing output before the job runs.
        outputFs.delete(output, true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    // The reduce-task count is not set here. The previous code only set it when a local
    // "shards" value was positive, and that value was hard-coded to -1.
    // NOTE(review): presumably the reducer count determines the number of Solr shards;
    // call job.setNumReduceTasks(n) here if that needs to be controlled.

    // No custom mapper: the base Mapper class is registered directly.
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());

    job.setOutputFormatClass(SolrOutputFormat.class);

    // Register the Solr configuration as a cache archive; the "#solr.zip" fragment names it.
    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());
    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());

    // Zip output is disabled.
    final boolean compressOutput = false;
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled()) {
        Utils.log(job, LOG);
    }

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.test.hadoop.unoExample.CardDriver.java

License:Apache License

/**
 * Configures and runs the card-totalling job, blocking until it finishes.
 *
 * @param args exactly two arguments: the input path followed by the output path
 * @return 0 if the job succeeded, 1 if it failed, -1 if the argument count is wrong
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
@Override
public int run(String[] args) throws Exception {
    // Guard clause: anything other than exactly two arguments is a usage error.
    if (args.length != 2) {
        System.err.println("Incorrect number of arguments.  Expected: input output");
        return -1;
    }
    final String inputLocation = args[0];
    final String outputLocation = args[1];

    Job job = new Job(getConf());
    job.setJobName(this.getClass().getName());
    job.setJarByClass(CardDriver.class);

    // Input and output locations.
    FileInputFormat.setInputPaths(job, new Path(inputLocation));
    FileOutputFormat.setOutputPath(job, new Path(outputLocation));

    // Map phase: CardMapper emits Text keys with IntWritable values.
    job.setMapperClass(CardMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Reduce phase: CardTotalReducer writes Text keys with IntWritable values.
    job.setReducerClass(CardTotalReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Runs a map-only job that reads the sequence-file input and writes it through
 * {@code MatrixOutputFormat}, blocking until the job finishes.
 *
 * @param conf the configuration the job is created from
 * @param matrixInputPath path of the input, read with {@code SequenceFileInputFormat}
 * @param matrixOutputPath path the job output is written to
 * @throws IOException if the job reports failure or an I/O error occurs
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "seq2mtx");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(Sequence2MatrixFormatJob.class.getSimpleName());
    job.setJarByClass(Sequence2MatrixFormatJob.class);

    // Resolve both paths against the input file system before handing them to the job.
    Path qualifiedInput = fs.makeQualified(matrixInputPath);
    Path qualifiedOutput = fs.makeQualified(matrixOutputPath);

    // Input side.
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(job, qualifiedInput);

    // Output side: zero reduce tasks, IntWritable keys and VectorWritable values.
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    FileOutputFormat.setOutputPath(job, qualifiedOutput);

    job.submit();
    if (!job.waitForCompletion(true)) {
        throw new IOException("Job failed!");
    }
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat} Refer to {@link ABInnerHDFSBroadcastOfB}
 * for further details./*from  w  w w  .  ja v  a2  s  . c  o m*/
 * 
 * @param conf the initial configuration
 * @param matrixInputPath path to matrix A
 * @param inMemMatrixDir path to matrix B (must be small enough to fit into
 *          memory)
 * @param matrixOutputPath path to which AxB will be written
 * @param inMemMatrixNumRows B rows
 * @param inMemMatrixNumCols B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner");
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABInnerHDFSBroadcastOfB.class);
    job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName());
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // since we do not use reducer, to get total order, the map output files has
    // to be renamed after this function returns: {@link
    // AlgebraCommon#fixPartitioningProblem}
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat} Refer to {@link ABOuterHDFSBroadcastOfA}
 * for further details./*  ww w. ja  va 2s .com*/
 * 
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          path to matrix A
 * @param inMemMatrixDir
 *          path to matrix B (must be small enough to fit into memory)
 * @param matrixOutputPath
 *          path to which AxB will be written
 * @param inMemMatrixNumRows
 *          B rows
 * @param inMemMatrixNumCols
 *          B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABOuterHDFSBroadcastOfA.class);
    job.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);

    job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    job.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Runs a map-side inner join of the two sequence-file inputs and reduces the mapper
 * output into {@code MatrixOutputFormat}, blocking until the job finishes.
 *
 * <p>The supplied configuration is copied before {@code OUT_CARD} is set, so the
 * caller's configuration is left untouched.
 *
 * @param conf the initial configuration; {@code algebra.reduceslots.multiply} (default 10)
 *          is read from it to choose the number of reduce tasks
 * @param atPath path of the first join input (presumably the transpose of A)
 * @param bPath path of the second join input (presumably B)
 * @param outPath path the job output is written to
 * @param outCardinality value stored under {@code OUT_CARD} for the job
 * @throws IOException if the job reports failure or an I/O error occurs
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Copy first: the job-specific key below must not pollute the caller's configuration.
    conf = new Configuration(conf);
    conf.setInt(OUT_CARD, outCardinality);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName());
    job.setJarByClass(AtBOuterStaticMapsideJoinJob.class);

    FileSystem fs = FileSystem.get(atPath.toUri(), conf);
    atPath = fs.makeQualified(atPath);
    bPath = fs.makeQualified(bPath);
    job.setInputFormatClass(CompositeInputFormat.class);
    // Map-side join expression: inner join of the two sequence-file inputs.
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, atPath, bPath));

    job.setOutputFormatClass(MatrixOutputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // The same reducer class is used both as combiner and as reducer.
    job.setCombinerClass(MyReducer.class);

    int numReducers = conf.getInt("algebra.reduceslots.multiply", 10);
    job.setNumReduceTasks(numReducers);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res) {
        throw new IOException("Job failed");
    }
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Submits the job that computes A x B, where At and B are paths to matrices stored in
 * {@link SequenceFileInputFormat}. One of At and B must also conform to the
 * {@link MapDir} format. See {@link AtB_DMJ} for further details.
 *
 * <p>The job is only submitted; this method returns without waiting for it to finish.
 * The supplied configuration is copied, so the caller's instance is not modified.
 *
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over
 *          which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols value stored under {@code RESULTCOLS}; presumably the number of columns of B
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner when {@code true}, {@code mapreduce.task.io.sort.mb} is set to 1
 *          and no combiner class is registered; when {@code false},
 *          {@code AtBOuterStaticMapsideJoinJob.MyReducer} is registered as the combiner
 * @return the submitted job
 * @throws IOException if an I/O error occurs while configuring or submitting the job
 * @throws InterruptedException if submission is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Work on a private copy of the configuration.
    conf = new Configuration(conf);

    // Job parameters passed through the configuration.
    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");

    if (useInMemCombiner) {
        // A fresh copy carries the reduced sort buffer setting.
        conf = new Configuration(conf);
        conf.set("mapreduce.task.io.sort.mb", "1");
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    job.setJarByClass(AtB_DMJ.class);

    Path qualifiedOutput = fs.makeQualified(matrixOutputPath);
    Path qualifiedInputs = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, qualifiedInputs, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, qualifiedOutput);

    // Map phase.
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // A combiner class is registered only when the in-memory combiner is not in use.
    if (!useInMemCombiner) {
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    }

    job.setNumReduceTasks(NMFCommon.getNumberOfReduceSlots(conf, "dmj"));
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    // Reduce phase and output.
    job.setReducerClass(EpsilonReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);

    job.submit();
    return job;
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

/**
 * Runs a job that passes the sequence-file input through {@code IdMapper} and
 * {@code IdReducer} with the requested number of reduce tasks, blocking until it finishes.
 *
 * @param conf the configuration the job is created from
 * @param matrixInputPath path of the input, read with {@code SequenceFileInputFormat}
 * @param matrixOutputPath path the output is written to; its last component is appended
 *          to the job name
 * @param aRows value handed to {@code RowPartitioner.setPartitioner} (presumably the
 *          number of rows of the matrix)
 * @param partitions number of reduce tasks
 * @throws IOException if the job reports failure or an I/O error occurs
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows, int partitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());
    job.setJarByClass(PartitionerJob.class);

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    Path qualifiedInput = fs.makeQualified(matrixInputPath);
    Path qualifiedOutput = fs.makeQualified(matrixOutputPath);

    // Input side.
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(job, qualifiedInput);

    // Processing classes.
    job.setMapperClass(IdMapper.class);
    job.setReducerClass(IdReducer.class);

    // Output side: one reduce task per requested partition.
    job.setNumReduceTasks(partitions);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, qualifiedOutput);

    // Registered last, after the reduce-task count and all other job settings.
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();
    if (!job.waitForCompletion(true)) {
        throw new IOException("Job failed!");
    }
}

From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java

License:Apache License

/**
 * Runs a map-only job that reads the sequence-file input through {@code IdMapper} and
 * writes it with {@code TextOutputFormat}, blocking until the job finishes.
 *
 * @param conf the configuration the job is created from
 * @param matrixInputPath path of the input, read with {@code SequenceFileInputFormat}
 * @param matrixOutputPath path the text output is written to
 * @throws IOException if the job reports failure or an I/O error occurs
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(Matrix2TextJob.class.getSimpleName());
    job.setJarByClass(Matrix2TextJob.class);

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    Path qualifiedInput = fs.makeQualified(matrixInputPath);
    Path qualifiedOutput = fs.makeQualified(matrixOutputPath);

    // The input format is supplied together with the path through MultipleInputs,
    // so no separate setInputFormatClass call is made.
    MultipleInputs.addInputPath(job, qualifiedInput, SequenceFileInputFormat.class);
    job.setMapperClass(IdMapper.class);

    // Zero reduce tasks; output key and value classes are not set explicitly.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, qualifiedOutput);

    job.submit();
    if (!job.waitForCompletion(true)) {
        throw new IOException("Job failed!");
    }
}