Example usage for org.apache.hadoop.mapred FileOutputFormat setOutputCompressorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass.

Prototype

public static void setOutputCompressorClass(JobConf conf, Class<? extends CompressionCodec> codecClass) 

Document

Set the CompressionCodec to be used to compress job outputs.
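
The call is typically paired with FileOutputFormat.setCompressOutput(conf, true), as the examples below show. Before the per-project examples, here is a minimal, self-contained sketch of the pattern: a map-only job that copies text input through the default IdentityMapper and writes gzip-compressed output. The class name and the choice of GzipCodec are illustrative; input and output paths are taken from the command line.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class CompressedCopy {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CompressedCopy.class);
        conf.setJobName(CompressedCopy.class.getSimpleName());

        // Map-only identity job: the default IdentityMapper passes
        // <LongWritable, Text> records from TextInputFormat straight through.
        conf.setNumReduceTasks(0);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Enable job-output compression and select the codec.
        FileOutputFormat.setCompressOutput(conf, true);
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}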

Usage

From source file: de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java

License: Apache License

public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(FlipJoBims.class);

    /* begin necessary for UKP cluster */
    conf.setMemoryForMapTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    conf.setMemoryForReduceTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    FileOutputFormat.setCompressOutput(conf, true); // compress output
    FileOutputFormat.setOutputCompressorClass(conf,
            org.apache.hadoop.io.compress.BZip2Codec.class); /* use the bzip2 codec for compression */
    conf.setCompressMapOutput(true); // compress mapper output
    /* end necessary for UKP cluster */

    conf.setJobName(FlipJoBims.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(FlipJoBims.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapOutputValueClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

}

From source file: edu.uci.ics.hyracks.imru.util.DataBalancer.java

License: Apache License

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DataBalancer.class);

    job.setJobName(DataBalancer.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[2]));

    if (args.length > 3) {
        if (args[3].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[3].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.core.util.DataGenerator.java

License: Apache License

public static void main(String[] args) throws IOException {

    JobConf job = new JobConf(DataGenerator.class);
    FileSystem dfs = FileSystem.get(job);
    String maxFile = "/maxtemp";
    dfs.delete(new Path(maxFile), true);

    job.setJobName(DataGenerator.class.getSimpleName() + " max ID");
    job.setMapperClass(MapMaxId.class);
    job.setCombinerClass(CombineMaxId.class);
    job.setReducerClass(ReduceMaxId.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VLongWritable.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(maxFile));
    job.setNumReduceTasks(1);
    JobClient.runJob(job);

    job = new JobConf(DataGenerator.class);
    job.set("hyracks.maxid.file", maxFile);
    job.setInt("hyracks.x", Integer.parseInt(args[2]));
    dfs.delete(new Path(args[1]), true);

    job.setJobName(DataGenerator.class.getSimpleName());
    job.setMapperClass(MapRecordGen.class);
    job.setReducerClass(ReduceRecordGen.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[3]));

    if (args.length > 4) {
        if (args[4].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[4].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: hibench.HiveDataGenerator.java

License: Apache License

private void createRankingsTable() throws IOException {

    LOG.info("Creating table rankings...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " rankings";

    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    job.setCombinerClass(ConcatTextCombiner.class);
    job.setReducerClass(CountRankingAndReplaceIdReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }

    //      job.setNumReduceTasks(options.agents/2);

    /***
     * need to join result with LINK table so that to replace
     * url ids with real contents
     */
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.T_LINK_PAGE), TextInputFormat.class,
            MyIdentityMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");

        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.RANKINGS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Table link-page file " + paths.getPath(DataPaths.T_LINK_PAGE) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as output");
    LOG.info("Ouput file " + paths.getResult(DataPaths.RANKINGS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.RANKINGS));
}

From source file: hibench.HiveDataGenerator.java

License: Apache License

private void createUserVisitsTable() throws IOException, URISyntaxException {

    LOG.info("Creating user visits...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " uservisits";
    job.setJobName(jobname);

    /***
     * Set distributed cache file for table generation,
     * cache files include:
     * 1. user agents
     * 2. country code and language code
     * 3. search keys
     */

    DistributedCache.addCacheFile(paths.getPath(DataPaths.uagentf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.countryf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.searchkeyf).toUri(), job);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    visit.setJobConf(job);

    job.setInputFormat(TextInputFormat.class);

    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.DUMMY), NLineInputFormat.class,
            CreateRandomAccessMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    job.setCombinerClass(CreateUserVisitsCombiner.class);
    job.setReducerClass(CreateUserVisitsReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }

    //      job.setNumReduceTasks(options.agents/2);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.USERVISITS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Dummy file " + paths.getPath(DataPaths.DUMMY) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as output");
    LOG.info("Ouput file " + paths.getResult(DataPaths.USERVISITS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.USERVISITS));
}

From source file: hibench.PageRankDataGenerator.java

License: Apache License

private void createPageRankNodes() throws IOException {

    LOG.info("Creating PageRank nodes...", null);

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " pagerank nodes";

    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, paths.getPath(DataPaths.LINKS));
    job.setInputFormat(TextInputFormat.class);

    if (options.PAGERANK_NODE_BALANCE) {
        /***
         * Balance the output order of nodes, to prevent the running
         * of pagerank bench from potential data skew
         */
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setMapperClass(BalancedLinkNodesMapper.class);
        job.setReducerClass(BalancedLinkNodesReducer.class);
        //         job.setPartitionerClass(ModulusPartitioner.class);

        if (options.reds > 0) {
            job.setNumReduceTasks(options.reds);
        } else {
            job.setNumReduceTasks(DataOptions.getMaxNumReduce());
        }
    } else {
        job.setMapOutputKeyClass(Text.class);

        job.setMapperClass(OutputLinkNodesMapper.class);
        job.setNumReduceTasks(0);
    }

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.VERTICALS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Links file " + paths.getPath(DataPaths.LINKS) + " as input");
    LOG.info("Vertices file " + paths.getResult(DataPaths.VERTICALS) + " as output");
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.VERTICALS));
}

From source file: hibench.PageRankDataGenerator.java

License: Apache License

/***
 * Create pagerank edge table, output link A->B as <A, B> pairs
 * @throws IOException
 */
private void createPageRankLinks() throws IOException {

    LOG.info("Creating PageRank links", null);

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " pagerank links";

    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, paths.getPath(DataPaths.T_LINK_PAGE));
    job.setInputFormat(TextInputFormat.class);

    job.setMapperClass(OutputLinkEdgesMapper.class);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.EDGES));

    LOG.info("Running Job: " + jobname);
    LOG.info("Table link-page " + paths.getPath(DataPaths.T_LINK_PAGE) + " as input");
    LOG.info("Edges file " + paths.getResult(DataPaths.EDGES) + " as output");
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.EDGES));
}

From source file: nthu.scopelab.tsqr.ssvd.BtJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, BtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("BtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("Btjob Job ID: " + rj.getJobID().toString());
}

From source file: nthu.scopelab.tsqr.ssvd.itBtJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, itBtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("itBtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("itBtJob Job ID: " + rj.getJobID().toString());
}

From source file: nthu.scopelab.tsqr.ssvd.UJob.java

License: Apache License

public void start(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath, Path outputPath,
        int k, boolean uHalfSigma, int mis) throws ClassNotFoundException, InterruptedException, IOException {
    String input = "";

    JobConf job = new JobConf(conf, UJob.class);
    jobclient = new JobClient(job);
    job.setJobName("UJob");
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapperClass(MultiplyMapper.class);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);

    FileSystem fs = FileSystem.get(job);
    fileGather fgather = new fileGather(
            new Path(inputPathQ.toString().substring(0, inputPathQ.toString().lastIndexOf("/") - 1)), "Q-", fs);
    mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    job.setNumReduceTasks(0);
    job.set("mapreduce.output.basename", OUTPUT_U);
    job.set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.set(PROP_SIGMA_PATH, sigmaPath.toString());
    if (uHalfSigma) {
        job.set(PROP_U_HALFSIGMA, "y");
    }
    job.setInt(QJob.PROP_K, k);
    FileSystem.get(job).delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(job, inputPathQ);
    //JobClient.runJob(job);
    jobid = jobclient.submitJob(job).getID();

}