Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputCompressorClass

Introduction

On this page you can find usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputCompressorClass, collected from open-source projects.

Prototype

public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass) 

Document

Set the CompressionCodec to be used to compress job outputs.
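
As a quick orientation before the project examples, the sketch below shows the typical call pattern. It is not taken from any of the sources listed here; the class name, the identity map-only job, and the argument positions are placeholders used only to put setCompressOutput and setOutputCompressorClass in context.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputExample {
    public static void main(String[] args) throws Exception {
        // Identity, map-only job: copies text input to compressed text output.
        Job job = Job.getInstance(new Configuration(), "compressed-output");
        job.setJarByClass(CompressedOutputExample.class);
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Enable output compression and choose the codec used for the part files.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In the org.apache.hadoop.mapreduce API, setOutputCompressorClass also switches output compression on, so the explicit setCompressOutput(job, true) call is largely for readability; most of the examples below pair the two calls anyway.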

Usage

From source file:org.apache.pig.builtin.PigStorage.java

License:Apache License

private void setCompression(Path path, Job job) {
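    // Infer the output codec from the store-location extension; anything else is written uncompressed.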
    String location = path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}

From source file:org.apache.pig.piggybank.storage.GAMultiStorage.java

License:Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}

From source file:org.apache.pig.piggybank.storage.MultiJsonStorage2.java

License:Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    //TODO: strip date path
    /*String[] parts = location.split("/");
    String[] rootParts = (String[])ArrayUtils.subarray(parts, 1, parts.length - 3);
    String newLocation = StringUtils.join(rootParts, "/");
    */
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}

From source file:org.apache.pig.piggybank.storage.MultiStorage.java

License:Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}

From source file:org.apache.sqoop.mapreduce.ImportJobBase.java

License:Apache License

/**
 * Configure the output format to use for the job.
 */
@Override
protected void configureOutputFormat(Job job, String tableName, String tableClassName)
        throws ClassNotFoundException, IOException {

    job.setOutputFormatClass(getOutputFormatClass());

    if (isHCatJob) {
        LOG.debug("Configuring output format for HCatalog  import job");
        SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(), tableName,
                job.getConfiguration());
        return;
    }

    if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        job.getConfiguration().set("mapred.output.value.class", tableClassName);
    }

    if (options.shouldUseCompression()) {
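        // Resolve the requested codec (gzip when none is named) and apply it to the job output.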
        FileOutputFormat.setCompressOutput(job, true);

        String codecName = options.getCompressionCodec();
        Class<? extends CompressionCodec> codecClass;
        if (codecName == null) {
            codecClass = GzipCodec.class;
        } else {
            Configuration conf = job.getConfiguration();
            codecClass = CodecMap.getCodec(codecName, conf).getClass();
        }
        FileOutputFormat.setOutputCompressorClass(job, codecClass);

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        }

        // SQOOP-428: Avro expects not a fully qualified class name but a "short"
        // name instead (e.g. "snappy") and it needs to be set in a custom
        // configuration option called "avro.output.codec".
        // The default codec is "deflate".
        if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
            if (codecName != null) {
                String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                // Avro only knows about "deflate" and not "default"
                if (shortName.equalsIgnoreCase("default")) {
                    shortName = "deflate";
                }
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
            } else {
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
            }
        }
    }

    Path outputPath = context.getDestination();
    FileOutputFormat.setOutputPath(job, outputPath);
}

From source file:org.archive.wayback.hadoop.CDXSortDriver.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    String delim = " ";

    long desiredMaps = 10;
    boolean compressOutput = false;
    boolean compressedInput = false;
    boolean gzipRange = false;
    List<String> otherArgs = new ArrayList<String>();
    int mapMode = CDXCanonicalizingMapper.MODE_FULL;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                desiredMaps = Integer.parseInt(args[++i]);
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--compressed-input".equals(args[i])) {
                compressedInput = true;
            } else if ("--gzip-range".equals(args[i])) {
                gzipRange = true;
            } else if ("--delimiter".equals(args[i])) {
                delim = args[++i];
            } else if ("--map-full".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_FULL;
            } else if ("--map-global".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_GLOBAL;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }

    String splitPathString = otherArgs.get(0);
    String inputPathString = otherArgs.get(1);
    String outputPathString = otherArgs.get(2);

    Path splitPath = new Path(splitPathString);
    Path inputPath = new Path(inputPathString);
    Path outputPath = new Path(outputPathString);

    Job job = new Job(getConf(), "cdx-sort");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(CDXSortDriver.class);

    job.setMapperClass(CDXCanonicalizingMapper.class);

    job.setReducerClass(CDXReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // configure the "map mode"
    CDXCanonicalizingMapper.setMapMode(conf, mapMode);

    // set up the delimiter:
    conf.set(TEXT_OUTPUT_DELIM_CONFIG, delim);

    if (compressOutput) {
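        // Gzip the sorted output when --compress-output was passed.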
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }

    // set up the Partitioner, including number of reduce tasks:
    FileSystem fs = inputPath.getFileSystem(conf);

    int splitCount = countLinesInPath(splitPath, conf);
    System.err.println("Split/Reduce count:" + splitCount);
    job.setNumReduceTasks(splitCount);

    AlphaPartitioner.setPartitionPath(conf, splitPathString);
    job.setPartitionerClass(AlphaPartitioner.class);

    // calculate the byte size to get the correct number of map tasks:
    FileStatus inputStatus = fs.getFileStatus(inputPath);
    long inputLen = inputStatus.getLen();
    long bytesPerMap = inputLen / desiredMaps;

    FileInputFormat.addInputPath(job, inputPath);
    FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);
    if (gzipRange) {
        job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class);
    } else {
        job.setInputFormatClass(LineDereferencingInputFormat.class);
        if (compressedInput) {
            LineDereferencingRecordReader.forceCompressed(conf);
        }
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java

License:Open Source License

public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    if (files.length == 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    // Use the first file in the input directory as the VCF header source unless one is supplied explicitly.
    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());
    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);

    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
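    // Write the temporary sorted partitions BGZF-compressed so they can be concatenated into a valid .vcf.gz below.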
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));

    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}

From source file:org.opencb.hpg.bigdata.tools.alignment.Bam2AvroMR.java

License:Apache License

public static int run(String input, String output, String codecName, boolean adjustQuality, Configuration conf)
        throws Exception {

    // read header, and save sequence index/name in conf
    final Path p = new Path(input);
    final SeekableStream seekableStream = WrapSeekable.openPath(conf, p);
    final SamReader reader = SamReaderFactory.make().open(SamInputResource.of(seekableStream));
    final SAMFileHeader header = reader.getFileHeader();
    int i = 0;
    SAMSequenceRecord sr;
    while ((sr = header.getSequence(i)) != null) {
        conf.set("" + i, sr.getSequenceName());
        i++;
    }

    Job job = Job.getInstance(conf, "Bam2AvroMR");
    job.setJarByClass(Bam2AvroMR.class);

    // Avro problem fix
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set(ADJUST_QUALITY, Boolean.toString(adjustQuality));

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, ReadAlignment.getClassSchema());
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, ReadAlignment.getClassSchema());

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(AnySAMInputFormat.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
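        // Compress the Avro output with the codec resolved from the supplied codec name.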
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(Void.class);

    job.setMapperClass(Bam2GaMapper.class);
    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    // write header
    Path headerPath = new Path(output + ".header");
    FileSystem fs = FileSystem.get(conf);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(headerPath, true)));
    br.write(header.getTextHeader());
    br.close();

    return 0;
}

From source file:org.opencb.hpg.bigdata.tools.sequence.Fastq2AvroMR.java

License:Apache License

public static int run(String input, String output, String codecName) throws Exception {
    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf, "Fastq2AvroMR");
    job.setJarByClass(Fastq2AvroMR.class);

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Read.SCHEMA$);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, Read.SCHEMA$);

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(FastqInputFormatMODIF.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(AvroValue.class);

    job.setMapperClass(Fastq2GaMapper.class);
    job.setReducerClass(Fastq2GaReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:org.seqdoop.hadoop_bam.TestVCFRoundTrip.java

License:Open Source License

private Path doMapReduce(final Path inputPath, final boolean writeHeader) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(VariantContextWritable.class);

    job.setOutputFormatClass(
            writeHeader ? VCFTestWithHeaderOutputFormat.class : VCFTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (codecClass != null) {
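        // In the mapreduce API, setOutputCompressorClass also enables output compression, so no separate setCompressOutput call is needed here.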
        FileOutputFormat.setOutputCompressorClass(job, codecClass);
    }

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);

    return outputPath;
}