Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException

Source Link

Document

Set the key class for the map output data.

Usage

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. One of At and B must also conform with
 * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 * //from   w w  w. ja v  a  2  s . c o m
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over
 *          which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner
 * @param numberOfJobs the hint for the desired number of parallel jobs
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");

    if (useInMemCombiner) {
        Configuration newConf = new Configuration(conf);
        newConf.set("mapreduce.task.io.sort.mb", "1");
        conf = newConf;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(AtB_DMJ.class);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    matrixInputPaths = fs.makeQualified(matrixInputPaths);
    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    if (!useInMemCombiner)
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setReducerClass(EpsilonReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}

From source file:com.twitter.algebra.nmf.ColPartitionJob.java

License:Apache License

/**
 * Partition A on columns, where A refers to the path that contain a matrix in
 * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for
 * further details.// ww  w.j  a va2 s. c  o  m
 * 
 * @param conf the initial configuration
 * @param matrixInputPath the path to the input matrix A
 * @param matrixOutputPath the path of the resulting partitioned matrix
 * @param numInputRows rows
 * @param numInputCols cols
 * @param numColPartitions the hint for the desired number of column
 *          partitions
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, int numColPartitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition");

    int colPartSize = getColPartitionSize(numInputCols, numColPartitions);
    numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize);

    if (numReducers < numColPartitions)
        numReducers = numColPartitions;

    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition");

    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(NUM_ORIG_COLS_KEY, numInputCols);
    conf.setInt(NUM_COL_PARTITIONS, numColPartitions);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ColPartitionJob.class);
    job.setJobName(ColPartitionJob.class.getSimpleName());

    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(ElementWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class, numInputRows,
            numInputCols, numColPartitions);

    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(numReducers);

    //    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}

From source file:com.twitter.algebra.nmf.CompositeDMJ.java

License:Apache License

public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols, float alpha1, float alpha2)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, inMemCStr);
    conf.setInt(MATRIXINMEMORYROWS, inMemCRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemCCols);

    conf.setFloat(ALPHA1, alpha1);/*  w  w  w  .ja  v a2  s. c o m*/
    conf.setFloat(ALPHA2, alpha2);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj");

    conf.set(MAPDIRMATRIX, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CompositeDMJ.class);
    job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    matrixInputPaths = fs.makeQualified(matrixInputPaths);
    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}

From source file:com.twitter.algebra.nmf.DistRndMatrixJob.java

License:Apache License

public void run(Configuration conf, Path inPath, Path matrixOutputPath, int numInputRows, int numInputCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setInt(ROWS, numInputRows);//from w  w w .  j  a  v a  2  s  .  c  o  m
    conf.setInt(COLS, numInputCols);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(DistRndMatrixJob.class);
    job.setJobName(DistRndMatrixJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(inPath.toUri(), conf);
    inPath = fs.makeQualified(inPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, inPath);
    job.setInputFormatClass(TextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "random");
    job.setNumReduceTasks(numReducers);

    job.setReducerClass(MyReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.Edge2MapDirJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, String name) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(INDEXNAME, name);//from   w  ww  . ja v a 2  s  .  c  om
    conf.setInt(ROWS, numInputRows);
    conf.setInt(COLS, numInputCols);
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "edge2matrix");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Edge2MapDirJob.class);
    job.setJobName(Edge2MapDirJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "edge2matrix");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.ErrDMJ.java

License:Apache License

public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath, int aRows,
        int ytRows, int ytCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(MAPDIRMATRIXX, xPath.toString());
    conf.set(MAPDIRMATRIXYT, ytPath.toString());
    conf.setInt(YTROWS, ytRows);/*from   w ww  . j a  va2 s  .c  o  m*/
    conf.setInt(YTCOLS, ytCols);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ErrDMJ.class);
    job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName());

    matrixAInputPath = fs.makeQualified(matrixAInputPath);
    MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class);

    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);
    job.setCombinerClass(SumVectorsReducer.class);
    job.setReducerClass(SumVectorsReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed! ");
    return job;
}

From source file:com.twitter.algebra.nmf.ReindexerJob.java

License:Apache License

public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ReindexerJob.class);
    job.setJobName(ReindexerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(MyReducer.class);
    // this makes the reindexing very slow but is necessary to have total order
    job.setNumReduceTasks(1);/*from ww w  .  j  av a 2 s  .c  om*/

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}

From source file:com.twitter.algebra.nmf.XtXJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, int numCols, String xmPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setInt(MATRIXCOLS, numCols);/*from  w  w  w  .  j a v a  2  s .co  m*/
    //    conf.set(XMPATH, xmPath);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, new Path[] { matrixInputPath }, "xtx");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName("XtXJob-" + matrixOutputPath.getName());
    job.setJarByClass(XtXJob.class);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "xtx");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numCols);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    job.waitForCompletion(true);
}

From source file:com.twitter.algebra.TransposeJob.java

License:Apache License

/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}./*w w w.  j ava 2  s .  c o m*/
 * 
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    //    job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java

License:Open Source License

/**
 * Sets up various job properites required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 * @param conf//from  w w w  . j  ava  2 s  .c  o m
 * @return
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
            getColumnName());
    return job;
}