Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException

Source Link

Document

Set the InputFormat for the job.

Usage

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "seq2mtx");
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Sequence2MatrixFormatJob.class);
    job.setJobName(Sequence2MatrixFormatJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setNumReduceTasks(0);/*w w w  . ja va 2  s  . co m*/

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat} Refer to {@link ABInnerHDFSBroadcastOfB}
 * for further details.// ww w  . j a v a  2 s .c om
 * 
 * @param conf the initial configuration
 * @param matrixInputPath path to matrix A
 * @param inMemMatrixDir path to matrix B (must be small enough to fit into
 *          memory)
 * @param matrixOutputPath path to which AxB will be written
 * @param inMemMatrixNumRows B rows
 * @param inMemMatrixNumCols B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner");
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABInnerHDFSBroadcastOfB.class);
    job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName());
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // since we do not use reducer, to get total order, the map output files has
    // to be renamed after this function returns: {@link
    // AlgebraCommon#fixPartitioningProblem}
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat} Refer to {@link ABOuterHDFSBroadcastOfA}
 * for further details./*from w w  w  . j  a va 2s .  co  m*/
 * 
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          path to matrix A
 * @param inMemMatrixDir
 *          path to matrix B (must be small enough to fit into memory)
 * @param matrixOutputPath
 *          path to which AxB will be written
 * @param inMemMatrixNumRows
 *          B rows
 * @param inMemMatrixNumCols
 *          B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABOuterHDFSBroadcastOfA.class);
    job.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);

    job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    job.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(OUT_CARD, outCardinality);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName());
    job.setJarByClass(AtBOuterStaticMapsideJoinJob.class);

    FileSystem fs = FileSystem.get(atPath.toUri(), conf);
    atPath = fs.makeQualified(atPath);/*ww  w .j a  v a  2  s . c  o m*/
    bPath = fs.makeQualified(bPath);
    job.setInputFormatClass(CompositeInputFormat.class);
    //mapside join expression
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, atPath, bPath));

    job.setOutputFormatClass(MatrixOutputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setCombinerClass(MyReducer.class);

    int numReducers = conf.getInt("algebra.reduceslots.multiply", 10);
    job.setNumReduceTasks(numReducers);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed");
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows, int partitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(PartitionerJob.class);
    job.setJobName(PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setNumReduceTasks(partitions);//from   w w w.j a  v  a2s .c  o m

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(IdMapper.class);
    job.setReducerClass(IdReducer.class);

    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.CombinerJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    //    conf.setBoolean("mapreduce.output.compress", true);
    //    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    //    conf.set("mapreduce.output.fileoutputformat.compress.codec", "com.hadoop.compression.lzo.LzoCodec");
    conf.setInt("dfs.replication", 20);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CombinerJob.class);
    job.setJobName(CombinerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "combiner");
    job.setNumReduceTasks(numReducers);// TODO: make it a parameter

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(IdMapper.class);
    job.setReducerClass(MergeVectorsReducer.class);

    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();/* ww  w. j  a  va 2 s  . c o m*/
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.DistRndMatrixJob.java

License:Apache License

public void run(Configuration conf, Path inPath, Path matrixOutputPath, int numInputRows, int numInputCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setInt(ROWS, numInputRows);//from  ww w.  j a v  a 2s . c o m
    conf.setInt(COLS, numInputCols);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(DistRndMatrixJob.class);
    job.setJobName(DistRndMatrixJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(inPath.toUri(), conf);
    inPath = fs.makeQualified(inPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, inPath);
    job.setInputFormatClass(TextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "random");
    job.setNumReduceTasks(numReducers);

    job.setReducerClass(MyReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.Edge2MapDirJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, String name) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(INDEXNAME, name);// w ww.j  a  va 2 s  .c  om
    conf.setInt(ROWS, numInputRows);
    conf.setInt(COLS, numInputCols);
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "edge2matrix");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Edge2MapDirJob.class);
    job.setJobName(Edge2MapDirJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "edge2matrix");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.ReindexerJob.java

License:Apache License

public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ReindexerJob.class);
    job.setJobName(ReindexerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(MyReducer.class);
    // this makes the reindexing very slow but is necessary to have total order
    job.setNumReduceTasks(1);//  w w w . j a v  a2 s.c om

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}

From source file:com.twitter.algebra.nmf.RowSquareSumJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(RowSquareSumJob.class);
    job.setJobName(RowSquareSumJob.class.getSimpleName() + "-" + matrixOutputPath.getName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);//from ww  w  .  j  av a  2 s.  c  o m

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SumMapper.class);
    job.setCombinerClass(MergeVectorsReducer.class);
    job.setReducerClass(MergeVectorsReducer.class);

    //    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class,
    //        aRows);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}