Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputCompressorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputCompressorClass.

Prototype

public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass) 

Document

Set the CompressionCodec to be used to compress job outputs.
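
A minimal sketch of typical usage, assuming a new-API MapReduce job that writes text output (the class name and output path below are illustrative, not from any of the sources listed under Usage):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class CompressedOutputExample {
    public static Job createJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "compressed-output-example");
        job.setJarByClass(CompressedOutputExample.class);
        // Mapper/reducer and input configuration omitted for brevity.

        // Plain text output; part files get a codec-specific extension (e.g. .gz).
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example-output")); // illustrative path

        // Enable output compression and choose the codec for the job's output files.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        return job;
    }
}

When the output format is SequenceFileOutputFormat, the examples below additionally call SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK) to select block-level compression.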

Usage

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF quads
 * 
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(QuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triple and/or quad inputs
 * 
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}

From source file: org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("recommenderClassName", "r", "Name of recommender class to instantiate");
    addOption("numRecommendations", "n", "Number of recommendations per user", "10");
    addOption("usersFile", "u", "Number of recommendations per user", null);

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputFile = getInputPath();
    Path outputPath = getOutputPath();
    Path usersFile = parsedArgs.get("--usersFile") == null ? inputFile
            : new Path(parsedArgs.get("--usersFile"));

    String recommendClassName = parsedArgs.get("--recommenderClassName");
    int recommendationsPerUser = Integer.parseInt(parsedArgs.get("--numRecommendations"));

    Job job = prepareJob(usersFile, outputPath, TextInputFormat.class, UserIDsMapper.class,
            VarLongWritable.class, NullWritable.class, RecommenderReducer.class, VarLongWritable.class,
            RecommendedItemsWritable.class, TextOutputFormat.class);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    Configuration jobConf = job.getConfiguration();
    jobConf.set(RecommenderReducer.RECOMMENDER_CLASS_NAME, recommendClassName);
    jobConf.setInt(RecommenderReducer.RECOMMENDATIONS_PER_USER, recommendationsPerUser);
    jobConf.set(RecommenderReducer.DATA_MODEL_FILE, inputFile.toString());

    job.waitForCompletion(true);
    return 0;
}

From source file: org.apache.mahout.cf.taste.hadoop.slopeone.SlopeOneAverageDiffsJob.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path prefsFile = getInputPath();
    Path outputPath = getOutputPath();
    Path averagesOutputPath = new Path(parsedArgs.get("--tempDir"));

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prefsToDiffsJob = prepareJob(prefsFile, averagesOutputPath, TextInputFormat.class,
                ToItemPrefsMapper.class, VarLongWritable.class, EntityPrefWritable.class,
                SlopeOnePrefsToDiffsReducer.class, EntityEntityWritable.class, FloatWritable.class,
                SequenceFileOutputFormat.class);
        prefsToDiffsJob.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job diffsToAveragesJob = prepareJob(averagesOutputPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, EntityEntityWritable.class, FloatWritable.class,
                SlopeOneDiffsToAveragesReducer.class, EntityEntityWritable.class, FloatWritable.class,
                TextOutputFormat.class);
        FileOutputFormat.setOutputCompressorClass(diffsToAveragesJob, GzipCodec.class);
        diffsToAveragesJob.waitForCompletion(true);
    }
    return 0;
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.BBtJob.java

License: Apache License

public static void run(Configuration conf, Path btPath, Path outputPath, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {

    Job job = new Job(conf);
    job.setJobName("BBt-job");
    job.setJarByClass(BBtJob.class);

    // input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, btPath);

    // map
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setMapperClass(BBtMapper.class);
    job.setReducerClass(BBtReducer.class);

    // combiner and reducer
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BBT);

    // run
    job.submit();
    job.waitForCompletion(false);
    if (!job.isSuccessful()) {
        throw new IOException("BBt job failed.");
    }
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
            labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */

    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * we can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */

    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.QJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPaths, Path sbPath, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, long seed, int numReduceTasks)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_QHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            DenseBlockWritable.class);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_RHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("Q-job");
    job.setJarByClass(QJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(QMapper.class);

    job.getConfiguration().setInt(PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);
    if (sbPath != null) {
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
    }

    /*
     * number of reduce tasks doesn't matter. we don't actually send anything to
     * reducers.
     */

    job.setNumReduceTasks(0 /* numReduceTasks */);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Q job unsuccessful.");
    }

}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.UJob.java

License: Apache License

public void run(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath, Path outputPath, int k,
        int numReduceTasks, Class<? extends Writable> labelClass, SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("U-job");
    job.setJarByClass(UJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathQ);
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_U);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(labelClass);
    job.setOutputValueClass(VectorWritable.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, sigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);
    job.submit();

}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.VJob.java

License: Apache License

/**
 *
 * @param conf
 * @param inputPathBt
 * @param xiPath
 *          PCA row mean (MAHOUT-817, to fix B')
 * @param sqPath
 *          sq (MAHOUT-817, to fix B')
 * @param inputUHatPath
 * @param inputSigmaPath
 * @param outputPath
 * @param k
 * @param numReduceTasks
 * @param outputScaling output scaling: apply Sigma, or Sigma^0.5, or none
 * @throws ClassNotFoundException
 * @throws InterruptedException
 * @throws IOException
 */
public void run(Configuration conf, Path inputPathBt, Path xiPath, Path sqPath,

        Path inputUHatPath, Path inputSigmaPath,

        Path outputPath, int k, int numReduceTasks, SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("V-job");
    job.setJarByClass(VJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathBt);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Warn: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(VMapper.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.submit();

}

From source file: org.apache.pig.builtin.PigStorage.java

License: Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));

    if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = job.getConfiguration().get("output.compression.codec");
        try {
            FileOutputFormat.setOutputCompressorClass(job,
                    (Class<? extends CompressionCodec>) Class.forName(codec));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Class not found: " + codec);
        }
    } else {
        // This makes it so that storing to a directory ending with ".gz" or ".bz2" works.
        setCompression(new Path(location), job);
    }
}