Usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setOutputCompressorClass
public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)
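For reference, a minimal map-only driver sketching the idiom the examples below share: enable compressed output on the job, then select the codec. The class name, job name, and argument handling here are illustrative placeholders, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-output-example");
        job.setJarByClass(CompressedOutputExample.class);
        job.setMapperClass(Mapper.class); // identity mapper: pass input records through
        job.setNumReduceTasks(0);         // map-only job
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Enable output compression and select the codec. In the new
        // (mapreduce) API, setOutputCompressorClass also turns compression
        // on, but the examples below typically set both explicitly.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}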
From source file:org.apache.pig.builtin.PigStorage.java
License:Apache License
private void setCompression(Path path, Job job) {
    // Choose the output codec from the store path's file extension.
    String location = path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}
From source file:org.apache.pig.piggybank.storage.GAMultiStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.pig.piggybank.storage.MultiJsonStorage2.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    // TODO: strip date path
    /*
    String[] parts = location.split("/");
    String[] rootParts = (String[]) ArrayUtils.subarray(parts, 1, parts.length - 3);
    String newLocation = StringUtils.join(rootParts, "/");
    */
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.pig.piggybank.storage.MultiStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
From source file:org.apache.sqoop.mapreduce.ImportJobBase.java
License:Apache License
/**
 * Configure the output format to use for the job.
 */
@Override
protected void configureOutputFormat(Job job, String tableName, String tableClassName)
        throws ClassNotFoundException, IOException {
    job.setOutputFormatClass(getOutputFormatClass());

    if (isHCatJob) {
        LOG.debug("Configuring output format for HCatalog import job");
        SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(),
                tableName, job.getConfiguration());
        return;
    }

    if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        job.getConfiguration().set("mapred.output.value.class", tableClassName);
    }

    if (options.shouldUseCompression()) {
        FileOutputFormat.setCompressOutput(job, true);

        String codecName = options.getCompressionCodec();
        Class<? extends CompressionCodec> codecClass;
        if (codecName == null) {
            codecClass = GzipCodec.class;
        } else {
            Configuration conf = job.getConfiguration();
            codecClass = CodecMap.getCodec(codecName, conf).getClass();
        }
        FileOutputFormat.setOutputCompressorClass(job, codecClass);

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        }

        // SQOOP-428: Avro expects not a fully qualified class name but a "short"
        // name instead (e.g. "snappy") and it needs to be set in a custom
        // configuration option called "avro.output.codec".
        // The default codec is "deflate".
        if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
            if (codecName != null) {
                String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                // Avro only knows about "deflate" and not "default"
                if (shortName.equalsIgnoreCase("default")) {
                    shortName = "deflate";
                }
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
            } else {
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
            }
        }
    }

    Path outputPath = context.getDestination();
    FileOutputFormat.setOutputPath(job, outputPath);
}
From source file:org.archive.wayback.hadoop.CDXSortDriver.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    String delim = " ";
    long desiredMaps = 10;
    boolean compressOutput = false;
    boolean compressedInput = false;
    boolean gzipRange = false;
    List<String> otherArgs = new ArrayList<String>();
    int mapMode = CDXCanonicalizingMapper.MODE_FULL;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                desiredMaps = Integer.parseInt(args[++i]);
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--compressed-input".equals(args[i])) {
                compressedInput = true;
            } else if ("--gzip-range".equals(args[i])) {
                gzipRange = true;
            } else if ("--delimiter".equals(args[i])) {
                delim = args[++i];
            } else if ("--map-full".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_FULL;
            } else if ("--map-global".equals(args[i])) {
                mapMode = CDXCanonicalizingMapper.MODE_GLOBAL;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }
    String splitPathString = otherArgs.get(0);
    String inputPathString = otherArgs.get(1);
    String outputPathString = otherArgs.get(2);

    Path splitPath = new Path(splitPathString);
    Path inputPath = new Path(inputPathString);
    Path outputPath = new Path(outputPathString);

    Job job = new Job(getConf(), "cdx-sort");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(CDXSortDriver.class);
    job.setMapperClass(CDXCanonicalizingMapper.class);
    job.setReducerClass(CDXReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // configure the "map mode"
    CDXCanonicalizingMapper.setMapMode(conf, mapMode);

    // set up the delimiter:
    conf.set(TEXT_OUTPUT_DELIM_CONFIG, delim);

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }

    // set up the Partitioner, including number of reduce tasks:
    FileSystem fs = inputPath.getFileSystem(conf);
    int splitCount = countLinesInPath(splitPath, conf);
    System.err.println("Split/Reduce count:" + splitCount);
    job.setNumReduceTasks(splitCount);
    AlphaPartitioner.setPartitionPath(conf, splitPathString);
    job.setPartitionerClass(AlphaPartitioner.class);

    // calculate the byte size to get the correct number of map tasks
    // (no int cast here: truncating the long length would overflow for
    // inputs larger than 2 GB):
    FileStatus inputStatus = fs.getFileStatus(inputPath);
    long inputLen = inputStatus.getLen();
    long bytesPerMap = inputLen / desiredMaps;

    FileInputFormat.addInputPath(job, inputPath);
    FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);

    if (gzipRange) {
        job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class);
    } else {
        job.setInputFormatClass(LineDereferencingInputFormat.class);
        if (compressedInput) {
            LineDereferencingRecordReader.forceCompressed(conf);
        }
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License:Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);
    // Check for an empty input dir before dereferencing files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }
    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);
    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));
    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);

    return 0;
}
From source file:org.opencb.hpg.bigdata.tools.alignment.Bam2AvroMR.java
License:Apache License
public static int run(String input, String output, String codecName, boolean adjustQuality,
        Configuration conf) throws Exception {
    // read header, and save sequence index/name in conf
    final Path p = new Path(input);
    final SeekableStream seekableStream = WrapSeekable.openPath(conf, p);
    final SamReader reader = SamReaderFactory.make().open(SamInputResource.of(seekableStream));
    final SAMFileHeader header = reader.getFileHeader();
    int i = 0;
    SAMSequenceRecord sr;
    while ((sr = header.getSequence(i)) != null) {
        conf.set("" + i, sr.getSequenceName());
        i++;
    }

    Job job = Job.getInstance(conf, "Bam2AvroMR");
    job.setJarByClass(Bam2AvroMR.class);

    // Avro problem fix
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set(ADJUST_QUALITY, Boolean.toString(adjustQuality));

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, ReadAlignment.getClassSchema());
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, ReadAlignment.getClassSchema());

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(AnySAMInputFormat.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(Void.class);
    job.setMapperClass(Bam2GaMapper.class);
    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    // write header
    Path headerPath = new Path(output + ".header");
    FileSystem fs = FileSystem.get(conf);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(headerPath, true)));
    br.write(header.getTextHeader());
    br.close();

    return 0;
}
From source file:org.opencb.hpg.bigdata.tools.sequence.Fastq2AvroMR.java
License:Apache License
public static int run(String input, String output, String codecName) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Fastq2AvroMR");
    job.setJarByClass(Fastq2AvroMR.class);

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Read.SCHEMA$);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, Read.SCHEMA$);

    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(FastqInputFormatMODIF.class);

    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(AvroValue.class);
    job.setMapperClass(Fastq2GaMapper.class);
    job.setReducerClass(Fastq2GaReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.seqdoop.hadoop_bam.TestVCFRoundTrip.java
License:Open Source License
private Path doMapReduce(final Path inputPath, final boolean writeHeader) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(VCFInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(VariantContextWritable.class);
    job.setOutputFormatClass(
            writeHeader ? VCFTestWithHeaderOutputFormat.class : VCFTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);
    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (codecClass != null) {
        FileOutputFormat.setOutputCompressorClass(job, codecClass);
    }

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);

    return outputPath;
}