Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass)

Source Link

Document

Set the OutputFormat implementation for the map-reduce job.

Usage

From source file:nthu.scopelab.tsqr.ssvd.ABtDenseOutJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPath, Path inputBt, Path outputPath, int k, int p,
        int reduceTasks, int mis) throws Exception {

    JobConf job = new JobConf(conf, ABtDenseOutJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setInt(QJob.PROP_K, k);/*w  w w.j  a  v a2 s  .co  m*/
    job.setInt(QJob.PROP_P, p);
    job.set(PROP_BT_PATH, inputBt.toString());

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setJobName("ABtDenseOutJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);

    job.setMapperClass(ABtMapper.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);

    RunningJob rj = JobClient.runJob(job);
}

From source file:nthu.scopelab.tsqr.ssvd.BtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, BtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);/*from  w w  w.  j  a  v a  2  s. c  o  m*/
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("BtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("Btjob Job ID: " + rj.getJobID().toString());
}

From source file:nthu.scopelab.tsqr.ssvd.itBtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, itBtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);//from   ww  w  . j a  v  a  2s. com
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("itBtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("itBtJob Job ID: " + rj.getJobID().toString());
}

From source file:nthu.scopelab.tsqr.ssvd.itQJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
        int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {

    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, itQJob.class);
        job.setJobName("itQ-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        if (i == 0)
            job.setMapperClass(QMapper.class);
        else//from  www .  j ava2  s . c om
            job.setMapperClass(IdentityMapper.class);

        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));

        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);

        FileInputFormat.setInputPaths(job, Paths);

        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
        //output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("itQJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}

From source file:nthu.scopelab.tsqr.ssvd.QJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
        int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {

    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, QJob.class);
        job.setJobName("Q-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        if (i == 0)
            job.setMapperClass(QMapper.class);
        else/*from w  w w .j a  v a 2  s  . c om*/
            job.setMapperClass(IdentityMapper.class);

        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));

        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);

        FileInputFormat.setInputPaths(job, Paths);

        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
        //output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("QJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}

From source file:nthu.scopelab.tsqr.ssvd.UJob.java

License:Apache License

public void start(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath, Path outputPath,
        int k, boolean uHalfSigma, int mis) throws ClassNotFoundException, InterruptedException, IOException {
    String input = "";

    JobConf job = new JobConf(conf, UJob.class);
    jobclient = new JobClient(job);
    job.setJobName("UJob");
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapperClass(MultiplyMapper.class);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);

    FileSystem fs = FileSystem.get(job);
    fileGather fgather = new fileGather(
            new Path(inputPathQ.toString().substring(0, inputPathQ.toString().lastIndexOf("/") - 1)), "Q-", fs);
    mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    job.setNumReduceTasks(0);//w  w  w  .java 2 s .  c om
    job.set("mapreduce.output.basename", OUTPUT_U);
    job.set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.set(PROP_SIGMA_PATH, sigmaPath.toString());
    if (uHalfSigma) {
        job.set(PROP_U_HALFSIGMA, "y");
    }
    job.setInt(QJob.PROP_K, k);
    FileSystem.get(job).delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(job, inputPathQ);
    //JobClient.runJob(job);
    jobid = jobclient.submitJob(job).getID();

}

From source file:org.acacia.csr.java.CSRConverter.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();// w  ww. ja v  a2s  . co m
        return;
    }
    //These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);

    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }

    System.out.println("Done deleting the dir : " + dir1);
    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");

    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath);
    }

    System.out.println("Done deleting the dir : " + dir2);

    //Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.

    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    //First job creates the inverted index

    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    //conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    //Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);
    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}

From source file:org.acacia.csr.java.LineCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*//  ww  w  . j  a  va  2s .com
      String dir1 = "/user/miyuru/wcout";
      String dir2 = "/user/miyuru/lcout";
       //We first delete the temporary directories if they exist on the HDFS
        FileSystem fs1 = FileSystem.get(new JobConf());
                
       if(fs1.exists(new Path(dir2))){
          fs1.delete(new Path(dir2), true);
       }
            
        JobConf conf = new JobConf(LineCount.class);
        conf.setJobName("LineCount");
               
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(IntWritable.class);
               
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
               
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
               
        FileInputFormat.setInputPaths(conf, new Path(dir1));
        FileOutputFormat.setOutputPath(conf, new Path(dir2));
               
        Job job = new Job(conf, "line count");
        job.waitForCompletion(true); 
        org.apache.hadoop.mapreduce.Counters cntr = job.getCounters();
        System .out.println("Number of lines in the file" + cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue());
        */

    long edgeCount = 0;
    //String dir3 = "/user/miyuru/wcout";
    String dir4 = "/user/miyuru/lcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs2 = FileSystem.get(new JobConf());

    if (fs2.exists(new Path(dir4))) {
        fs2.delete(new Path(dir4), true);
    }

    JobConf conf1 = new JobConf(LineCount.class);
    conf1.setJobName("LineCount");

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(IntWritable.class);

    conf1.setMapperClass(Map.class);
    conf1.setCombinerClass(Reduce.class);
    conf1.setReducerClass(Reduce.class);

    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf1, new Path(dir5));
    FileOutputFormat.setOutputPath(conf1, new Path(dir4));

    Job job1 = new Job(conf1, "line count");
    job1.setNumReduceTasks(0);
    job1.waitForCompletion(true);
    org.apache.hadoop.mapreduce.Counters cntr = job1.getCounters();
    edgeCount = cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();

    File efile = new File("/tmp/efile");

    if (efile.exists()) {
        efile.delete();
    }

    PrintWriter writer = new PrintWriter("/tmp/efile", "UTF-8");
    writer.println(edgeCount);
    writer.flush();
    writer.close();

    //edgeCount = edgeCount -1;//This is to remove the line number additionlly added to each edgelist file by HDFS. This is strange, but it happens.
    System.out.println("======>Edge count is : " + edgeCount);
    System.out.println("------Done Line Count---------------");
}

From source file:org.acacia.csr.java.NotInFinder.java

License:Apache License

public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }/* w w w  .java  2s.  c o  m*/

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));
    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    //   job.setMapperClass(TokenizerMapper.class);
    //   job.setCombinerClass(IntSumReducer.class);
    //   job.setReducerClass(IntSumReducer.class);
    //   job.setOutputKeyClass(LongWritable.class);
    //   job.setOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);

}

From source file:org.acacia.partitioner.java.EdgeDistributor.java

License:Apache License

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    String dir1 = "/user/miyuru/input";
    String dir2 = "/user/miyuru/edgedistributed-out";

    //      //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }/*from  w  ww  .  ja v  a 2s  .  c  om*/

    //First job scans through the edge list and splits the edges in to separate files based on the partitioned vertex files.

    JobConf conf = new JobConf(EdgeDistributor.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[0]);
    conf.set("org.acacia.partitioner.hbase.table", args[1]);
    conf.set("org.acacia.partitioner.index.contacthost", args[2]);
    conf.set("vert-count", args[3]);
    conf.set("initpartition-id", args[4]);
    conf.set("zero-flag", args[5]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(FileMapper.class);
    conf.setReducerClass(FileReducer.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setNumReduceTasks(96); //Need to specify the number of reduce tasks explicitly. Otherwise it creates only one reduce task.

    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));

    MultipleOutputs.addMultiNamedOutput(conf, "partition", TextOutputFormat.class, NullWritable.class,
            Text.class);

    Job job = new Job(conf, "EdgeDistributor");
    job.waitForCompletion(true);

    System.out.println("Done job EdgeDistribution");
}