List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setOutputCompressorClass
public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)
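A minimal sketch of a typical call site (the job name, output path, and codec choice here are illustrative, not taken from any example below). Note that setCompressOutput must also be enabled, as every example on this page does:

// Assumes org.apache.hadoop.mapreduce.Job, org.apache.hadoop.fs.Path,
// org.apache.hadoop.io.compress.GzipCodec and FileOutputFormat are imported.
Job job = Job.getInstance(new Configuration(), "compression-example"); // illustrative name
FileOutputFormat.setOutputPath(job, new Path("/tmp/out"));             // illustrative path
FileOutputFormat.setCompressOutput(job, true);   // compression must be switched on as well
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);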
From source file: gov.jgi.meta.pig.storage.FastaOutput.java
License: Open Source License

public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (location.endsWith(".bz2")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
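An alternative to hard-coding the extension checks above is Hadoop's CompressionCodecFactory, which maps a path's extension to a registered codec. A sketch, assuming the desired codecs are registered via io.compression.codecs:

CompressionCodecFactory factory = new CompressionCodecFactory(job.getConfiguration());
CompressionCodec codec = factory.getCodec(new Path(location)); // null if no codec matches the extension
if (codec != null) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, codec.getClass());
}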
From source file: gr.ntua.h2rdf.LoadTriples.DistinctIds.java
License: Open Source License

public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }
        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);
        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.setCombinerClass(Combiner.class);

    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
    return job;
}
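Note: the mapred.* keys used above are the old (pre-Hadoop 2) property names, kept working by Hadoop's configuration deprecation layer. A sketch of the same map-output compression with the Hadoop 2.x key names:

job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
job.getConfiguration().set("mapreduce.map.output.compress.codec",
        "org.apache.hadoop.io.compress.SnappyCodec");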
From source file: gr.ntua.h2rdf.loadTriples.Translate.java
License: Apache License

public static Job createSubmittableJob(String[] args) throws IOException {
    Job job = new Job();
    Configuration conf = job.getConfiguration();
    FileSystem fs;
    int reducers = 0;
    try {
        fs = FileSystem.get(conf);
        FileStatus[] p = fs.listStatus(new Path("blockIds/"));
        reducers = p.length;
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setNumReduceTasks(reducers);
        Path out = new Path("translations");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        FileInputFormat.addInputPath(job, new Path("temp"));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(Translate.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setPartitionerClass(IdPartitioner.class);
        job.setJobName("Translate");

        job.getConfiguration().set("mapred.compress.map.output", "true");
        job.getConfiguration().set("mapred.map.output.compression.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return job;
}
From source file: nl.naward04.hadoop.country.Country.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Find the country based on domain name or IP address.");
    job.setJarByClass(Country.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CountryLookup.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
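Setting the raw FileOutputFormat.COMPRESS_TYPE key works; an equivalent, typed way to request BLOCK compression for sequence-file output (the form used by some of the later examples on this page) is the helper on SequenceFileOutputFormat:

// Equivalent to conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK") for sequence file output
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);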
From source file: nl.naward05.hadoop.MergeFiles.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Merge countries and songs");
    job.setJarByClass(MergeFiles.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    job.setReducerClass(MergeReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: nl.surfsara.warcexamples.hadoop.rr.RR.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Record Recognizer");
    job.setJarByClass(RR.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RRMapper.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // job.setOutputValueClass(LongWritable.class);
    // job.setReducerClass(LongSumReducer.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: nl.utwente.mirex.AnchorExtract.java
License: Open Source License

/**
 * Runs the MapReduce job "anchor text extraction"
 * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts
 * @usage.
 * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/*/ /user/hadoop/ClueWeb09_Anchors </code>
 */
public static void main(String[] args) throws Exception {
    // Set job configuration
    Configuration conf = new Configuration();
    conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
    Job job = new Job(conf, "AnchorExtract");
    job.setJarByClass(AnchorExtract.class);

    if (args.length != 2) {
        System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
        System.out.println(" inputFiles: path to data");
        System.out.println(" outputFile: directory where anchor text is stored");
        System.exit(1);
    }
    int argc = 0;
    String inputFiles = args[argc++];
    String outputFile = args[argc++];

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(WarcFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list.
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.waitForCompletion(true);
}
From source file: nthu.scopelab.tsqr.ssvd.VJob.java
License: Apache License

public void start(Configuration conf, Path inputPathBt, Path inputUHatPath, Path inputSigmaPath,
        Path outputPath, int k, int numReduceTasks, int subRowSize, boolean vHalfSigma, int mis)
        throws ClassNotFoundException, InterruptedException, IOException {
    job = new Job(conf);
    job.setJobName("V-job");
    job.setJarByClass(VJob.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathBt);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    fileGather fgather = new fileGather(inputPathBt, "", fs);
    mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
    FileInputFormat.setMaxInputSplitSize(job, mis * 1024 * 1024);

    FileOutputFormat.setOutputPath(job, outputPath);
    // Warn: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);

    job.setMapperClass(VMapper.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
    if (vHalfSigma) {
        job.getConfiguration().set(PROP_V_HALFSIGMA, "y");
    }
    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(SUB_ROW_SIZE, subRowSize);
    job.setNumReduceTasks(0);

    job.submit();
    //job.waitForCompletion(true);
}
From source file: org.apache.jena.grande.pig.RdfStorage.java
License: Apache License

@SuppressWarnings("unchecked")
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    log.debug("setStoreLocation({}, {})", location, job);
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = job.getConfiguration().get("output.compression.codec");
        try {
            FileOutputFormat.setOutputCompressorClass(job,
                    (Class<? extends CompressionCodec>) Class.forName(codec));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Class not found: " + codec);
        }
    } else {
        if (location.endsWith(".bz2") || location.endsWith(".bz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        } else if (location.endsWith(".gz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        } else {
            FileOutputFormat.setCompressOutput(job, false);
        }
    }
}
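A short usage sketch for the configuration-driven branch above. The property names are the ones the method reads; the codec value is illustrative and can be any codec class on the classpath:

Configuration conf = job.getConfiguration();
conf.set("output.compression.enabled", "true");
conf.set("output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // illustrative choice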
From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triples
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(TripleGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(TripleWritable.class);
    job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(TriplesInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}