Usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setOutputCompressorClass
public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)
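For reference, a minimal map-only driver sketching the idiom the examples below share: enable compressed output on the job, then select the codec. The class name, job name, and argument handling here are illustrative placeholders, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-output-example");
        job.setJarByClass(CompressedOutputExample.class);
        job.setMapperClass(Mapper.class); // identity mapper: pass input records through
        job.setNumReduceTasks(0);         // map-only job
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Enable output compression and select the codec. In the new
        // (mapreduce) API, setOutputCompressorClass also turns compression
        // on, but the examples below typically set both explicitly.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}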
From source file:org.apache.pig.builtin.PigStorage.java
License:Apache License
private void setCompression(Path path, Job job) {
    // Choose the output codec from the store path's file extension.
    String location = path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}
From source file:org.apache.pig.piggybank.storage.GAMultiStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.pig.piggybank.storage.MultiJsonStorage2.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    // TODO: strip date path
    /*
    String[] parts = location.split("/");
    String[] rootParts = (String[]) ArrayUtils.subarray(parts, 1, parts.length - 3);
    String newLocation = StringUtils.join(rootParts, "/");
    */
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.pig.piggybank.storage.MultiStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.sqoop.mapreduce.ImportJobBase.java
License:Apache License
/**
 * Configure the output format to use for the job.
 */
@Override
protected void configureOutputFormat(Job job, String tableName, String tableClassName)
        throws ClassNotFoundException, IOException {
    job.setOutputFormatClass(getOutputFormatClass());

    if (isHCatJob) {
        LOG.debug("Configuring output format for HCatalog import job");
        SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(),
                tableName, job.getConfiguration());
        return;
    }

    if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        job.getConfiguration().set("mapred.output.value.class", tableClassName);
    }

    if (options.shouldUseCompression()) {
        FileOutputFormat.setCompressOutput(job, true);

        String codecName = options.getCompressionCodec();
        Class<? extends CompressionCodec> codecClass;
        if (codecName == null) {
            codecClass = GzipCodec.class;
        } else {
            Configuration conf = job.getConfiguration();
            codecClass = CodecMap.getCodec(codecName, conf).getClass();
        }
        FileOutputFormat.setOutputCompressorClass(job, codecClass);

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        }

        // SQOOP-428: Avro expects not a fully qualified class name but a "short"
        // name instead (e.g. "snappy") and it needs to be set in a custom
        // configuration option called "avro.output.codec".
        // The default codec is "deflate".
        if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
            if (codecName != null) {
                String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                // Avro only knows about "deflate" and not "default"
                if (shortName.equalsIgnoreCase("default")) {
                    shortName = "deflate";
                }
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
            } else {
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
            }
        }
    }

    Path outputPath = context.getDestination();
    FileOutputFormat.setOutputPath(job, outputPath);
}
From source file:org.archive.wayback.hadoop.CDXSortDriver.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    String delim = " ";
    long desiredMaps = 10;
    boolean compressOutput = false;
    boolean compressedInput = false;
    boolean gzipRange = false;
    List<String> otherArgs = new ArrayList<String>();
    int mapMode = CDXCanonicalizingMapper.MODE_FULL;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                desiredMaps = Integer.parseInt(args[++i]);
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--compressed-input".equals(args[i])) {
                compressedInput = true;
            } else if ("--gzip-range".equals(args[i])) {
                gzipRange = true;
            } else if ("--delimiter".equals(args[i])) {
                delim = args[++i];
            } else if ("--map-full".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_FULL;
            } else if ("--map-global".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_GLOBAL;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }
    String splitPathString = otherArgs.get(0);
    String inputPathString = otherArgs.get(1);
    String outputPathString = otherArgs.get(2);

    Path splitPath = new Path(splitPathString);
    Path inputPath = new Path(inputPathString);
    Path outputPath = new Path(outputPathString);

    Job job = new Job(getConf(), "cdx-sort");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(CDXSortDriver.class);
    job.setMapperClass(CDXCanonicalizingMapper.class);
    job.setReducerClass(CDXReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // configure the "map mode"
    CDXCanonicalizingMapper.setMapMode(conf, mapMode);

    // set up the delimiter:
    conf.set(TEXT_OUTPUT_DELIM_CONFIG, delim);

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }

    // set up the Partitioner, including number of reduce tasks:
    FileSystem fs = inputPath.getFileSystem(conf);
    int splitCount = countLinesInPath(splitPath, conf);
    System.err.println("Split/Reduce count:" + splitCount);
    job.setNumReduceTasks(splitCount);
    AlphaPartitioner.setPartitionPath(conf, splitPathString);
    job.setPartitionerClass(AlphaPartitioner.class);

    // calculate the byte size to get the correct number of map tasks
    // (no int cast here: truncating the long length would overflow for
    // inputs larger than 2 GB):
    FileStatus inputStatus = fs.getFileStatus(inputPath);
    long inputLen = inputStatus.getLen();
    long bytesPerMap = inputLen / desiredMaps;

    FileInputFormat.addInputPath(job, inputPath);
    FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);

    if (gzipRange) {
        job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class);
    } else {
        job.setInputFormatClass(LineDereferencingInputFormat.class);
        if (compressedInput) {
            LineDereferencingRecordReader.forceCompressed(conf);
        }
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License:Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);
    // Check for an empty input dir before dereferencing files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }
    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);
    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));
    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);

    return 0;
}
From source file:org.opencb.hpg.bigdata.tools.alignment.Bam2AvroMR.java
License:Apache License
public static int run(String input, String output, String codecName, boolean adjustQuality,
        Configuration conf) throws Exception {
    // read header, and save sequence index/name in conf
    final Path p = new Path(input);
    final SeekableStream seekableStream = WrapSeekable.openPath(conf, p);
    final SamReader reader = SamReaderFactory.make().open(SamInputResource.of(seekableStream));
    final SAMFileHeader header = reader.getFileHeader();
    int i = 0;
    SAMSequenceRecord sr;
    while ((sr = header.getSequence(i)) != null) {
        conf.set("" + i, sr.getSequenceName());
        i++;
    }

    Job job = Job.getInstance(conf, "Bam2AvroMR");
    job.setJarByClass(Bam2AvroMR.class);

    // Avro problem fix
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set(ADJUST_QUALITY, Boolean.toString(adjustQuality));

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, ReadAlignment.getClassSchema());
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, ReadAlignment.getClassSchema());

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(AnySAMInputFormat.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(Void.class);
    job.setMapperClass(Bam2GaMapper.class);
    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    // write header
    Path headerPath = new Path(output + ".header");
    FileSystem fs = FileSystem.get(conf);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(headerPath, true)));
    br.write(header.getTextHeader());
    br.close();

    return 0;
}
From source file:org.opencb.hpg.bigdata.tools.sequence.Fastq2AvroMR.java
License:Apache License
public static int run(String input, String output, String codecName) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Fastq2AvroMR");
    job.setJarByClass(Fastq2AvroMR.class);

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Read.SCHEMA$);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, Read.SCHEMA$);

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(FastqInputFormatMODIF.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(AvroValue.class);
    job.setMapperClass(Fastq2GaMapper.class);
    job.setReducerClass(Fastq2GaReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.seqdoop.hadoop_bam.TestVCFRoundTrip.java
License:Open Source License
private Path doMapReduce(final Path inputPath, final boolean writeHeader) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(VCFInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(VariantContextWritable.class);
    job.setOutputFormatClass(
            writeHeader ? VCFTestWithHeaderOutputFormat.class : VCFTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);
    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (codecClass != null) {
        FileOutputFormat.setOutputCompressorClass(job, codecClass);
    }

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);

    return outputPath;
}