Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputCompressorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputCompressorClass.

Prototype

public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass) 

Document

Set the CompressionCodec to be used to compress job outputs.
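
A minimal sketch of typical usage, assuming a new-API MapReduce job that writes text output (the class name and output path below are illustrative, not from any of the sources listed under Usage):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class CompressedOutputExample {
    public static Job createJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "compressed-output-example");
        job.setJarByClass(CompressedOutputExample.class);
        // Mapper/reducer and input configuration omitted for brevity.

        // Plain text output; part files get a codec-specific extension (e.g. .gz).
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example-output")); // illustrative path

        // Enable output compression and choose the codec for the job's output files.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        return job;
    }
}

When the output format is SequenceFileOutputFormat, the examples below additionally call SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK) to select block-level compression.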

Usage

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF quads
 * 
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(QuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triple and/or quad inputs
 * 
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}

From source file: org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("recommenderClassName", "r", "Name of recommender class to instantiate");
    addOption("numRecommendations", "n", "Number of recommendations per user", "10");
    addOption("usersFile", "u", "Number of recommendations per user", null);

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputFile = getInputPath();
    Path outputPath = getOutputPath();
    Path usersFile = parsedArgs.get("--usersFile") == null ? inputFile
            : new Path(parsedArgs.get("--usersFile"));

    String recommendClassName = parsedArgs.get("--recommenderClassName");
    int recommendationsPerUser = Integer.parseInt(parsedArgs.get("--numRecommendations"));

    Job job = prepareJob(usersFile, outputPath, TextInputFormat.class, UserIDsMapper.class,
            VarLongWritable.class, NullWritable.class, RecommenderReducer.class, VarLongWritable.class,
            RecommendedItemsWritable.class, TextOutputFormat.class);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    Configuration jobConf = job.getConfiguration();
    jobConf.set(RecommenderReducer.RECOMMENDER_CLASS_NAME, recommendClassName);
    jobConf.setInt(RecommenderReducer.RECOMMENDATIONS_PER_USER, recommendationsPerUser);
    jobConf.set(RecommenderReducer.DATA_MODEL_FILE, inputFile.toString());

    job.waitForCompletion(true);
    return 0;
}

From source file: org.apache.mahout.cf.taste.hadoop.slopeone.SlopeOneAverageDiffsJob.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path prefsFile = getInputPath();
    Path outputPath = getOutputPath();
    Path averagesOutputPath = new Path(parsedArgs.get("--tempDir"));

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prefsToDiffsJob = prepareJob(prefsFile, averagesOutputPath, TextInputFormat.class,
                ToItemPrefsMapper.class, VarLongWritable.class, EntityPrefWritable.class,
                SlopeOnePrefsToDiffsReducer.class, EntityEntityWritable.class, FloatWritable.class,
                SequenceFileOutputFormat.class);
        prefsToDiffsJob.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job diffsToAveragesJob = prepareJob(averagesOutputPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, EntityEntityWritable.class, FloatWritable.class,
                SlopeOneDiffsToAveragesReducer.class, EntityEntityWritable.class, FloatWritable.class,
                TextOutputFormat.class);
        FileOutputFormat.setOutputCompressorClass(diffsToAveragesJob, GzipCodec.class);
        diffsToAveragesJob.waitForCompletion(true);
    }
    return 0;
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.BBtJob.java

License: Apache License

public static void run(Configuration conf, Path btPath, Path outputPath, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {

    Job job = new Job(conf);
    job.setJobName("BBt-job");
    job.setJarByClass(BBtJob.class);

    // input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, btPath);

    // map
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setMapperClass(BBtMapper.class);
    job.setReducerClass(BBtReducer.class);

    // combiner and reducer
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BBT);

    // run
    job.submit();
    job.waitForCompletion(false);
    if (!job.isSuccessful()) {
        throw new IOException("BBt job failed.");
    }
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
            labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */

    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * we can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */

    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.QJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPaths, Path sbPath, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, long seed, int numReduceTasks)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_QHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            DenseBlockWritable.class);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_RHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("Q-job");
    job.setJarByClass(QJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(QMapper.class);

    job.getConfiguration().setInt(PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);
    if (sbPath != null) {
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
    }

    /*
     * number of reduce tasks doesn't matter. we don't actually send anything to
     * reducers.
     */

    job.setNumReduceTasks(0 /* numReduceTasks */);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Q job unsuccessful.");
    }

}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.UJob.java

License: Apache License

public void run(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath, Path outputPath, int k,
        int numReduceTasks, Class<? extends Writable> labelClass, SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("U-job");
    job.setJarByClass(UJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathQ);
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_U);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(labelClass);
    job.setOutputValueClass(VectorWritable.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, sigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);
    job.submit();

}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.VJob.java

License: Apache License

/**
 *
 * @param conf
 * @param inputPathBt
 * @param xiPath
 *          PCA row mean (MAHOUT-817, to fix B')
 * @param sqPath
 *          sq (MAHOUT-817, to fix B')
 * @param inputUHatPath
 * @param inputSigmaPath
 * @param outputPath
 * @param k
 * @param numReduceTasks
 * @param outputScaling output scaling: apply Sigma, or Sigma^0.5, or none
 * @throws ClassNotFoundException
 * @throws InterruptedException
 * @throws IOException
 */
public void run(Configuration conf, Path inputPathBt, Path xiPath, Path sqPath,

        Path inputUHatPath, Path inputSigmaPath,

        Path outputPath, int k, int numReduceTasks, SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("V-job");
    job.setJarByClass(VJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathBt);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Warn: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(VMapper.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.submit();

}

From source file: org.apache.pig.builtin.PigStorage.java

License: Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));

    if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = job.getConfiguration().get("output.compression.codec");
        try {
            FileOutputFormat.setOutputCompressorClass(job,
                    (Class<? extends CompressionCodec>) Class.forName(codec));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Class not found: " + codec);
        }
    } else {
        // This makes it so that storing to a directory ending with ".gz" or ".bz2" works.
        setCompression(new Path(location), job);
    }
}