List of usage examples for org.apache.hadoop.io.compress.CompressionCodec.getDefaultExtension()
/**
 * Returns the default file name extension associated with this compression codec.
 *
 * <p>NOTE(review): the callers in this listing append the returned value directly to a file
 * name without inserting a separator, so it presumably already includes the leading dot
 * (for example {@code ".gz"}) - confirm against the Hadoop API documentation.
 *
 * @return the codec's default file name extension
 */
String getDefaultExtension();
From source file:com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.java
License:Open Source License
public ResourceRecordWriter(FileSystem fs, Path taskWorkPath, CompressionCodec codecIfAny) throws IOException { if (fs.exists(taskWorkPath)) { throw new IOException("Task work path already exists:" + taskWorkPath.toString()); }/*from w w w .j a v a 2 s . c o m*/ fs.mkdirs(taskWorkPath); for (OUTPUT output : OUTPUT.values()) { OutputStream out; if (codecIfAny != null) { Path file = new Path(taskWorkPath, output.filename + codecIfAny.getDefaultExtension()); out = fs.create(file, false); out = codecIfAny.createOutputStream(out); } else { Path file = new Path(taskWorkPath, output.filename); out = fs.create(file, false); } writersMap.put(output, new OutputStreamWriter(out, Charset.forName("UTF-8"))); } Path file = new Path(taskWorkPath, "bySubject.bz2"); OutputStream compressedOutputStream = fs.create(file, false); file = new Path(taskWorkPath, "bySubject.blockOffsets"); bySubjectOffsetsOutputStream = fs.create(file, false); blockOffsetsBuilder = new BlockOffsets.Builder(); // Create a Writer on a BZip2 compressed OutputStream with a small block // size( * 100K). uncompressedOutputStream = new BZip2OutputStream(compressedOutputStream, 1, new BZip2OutputStream.Callback() { @Override public void blockStart(long blockStartBitOffset) { if (lastBlockStartBitOffset != 0) { blockOffsetsBuilder.setBlockStart(lastBlockStartBitOffset, lastFirstDocId); } lastBlockStartBitOffset = blockStartBitOffset; } @Override public void finish(long totalBitsWritten) { blockOffsetsBuilder.close(totalBitsWritten); } }); bySubjectWriter = new OutputStreamWriter(uncompressedOutputStream); }
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception { String uri = args[0];//from w w w . java 2 s.c o m Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); Path inputPath = new Path(uri); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(inputPath); if (codec == null) { System.err.println("No codec found for " + uri); System.exit(1); } String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); InputStream in = null; OutputStream out = null; try { in = codec.createInputStream(fs.open(inputPath)); out = fs.create(new Path(outputUri)); IOUtils.copyBytes(in, out, conf); } finally { IOUtils.closeStream(in); IOUtils.closeStream(out); } }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter.java
License:Apache License
/** * Creates a WARC file, and opens it for writing. If a file with the same name already * exists, it is *overwritten*. Note that this is different behaviour from the other * constructor. Yes, this sucks. It will probably change in a future version. * * @param conf The Hadoop configuration. * @param codec If null, the file is uncompressed. If non-null, this compression codec * will be used. The codec's default file extension is appended to the filename. * @param workOutputPath The directory and filename prefix to which the data should be * written. We append a segment number and filename extensions to it. * @param progress An object used by the mapred API for tracking a task's progress. * @throws IOException I/O exception// www . j av a 2 s . c o m */ public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress) throws IOException { this.conf = conf; this.codec = codec; this.workOutputPath = workOutputPath; this.progress = progress; this.extensionFormat = ".seg-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension()); this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE); createSegment(); }
From source file:dev.geminileft.outputformat.MyTextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); boolean isCompressed = getCompressOutput(job); String keyValueSeparator = conf.get(SEPERATOR, "\t"); String keyValueDelimiter = conf.get(DELIMITER, "\n"); CompressionCodec codec = null; String extension = ""; if (isCompressed) { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); extension = codec.getDefaultExtension(); }// w ww. j a v a 2s .c o m Path file = getDefaultWorkFile(job, extension); FileSystem fs = file.getFileSystem(conf); if (!isCompressed) { FSDataOutputStream fileOut = fs.create(file, false); return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, keyValueDelimiter); } else { FSDataOutputStream fileOut = fs.create(file, false); return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator, keyValueDelimiter); } }
From source file:edu.arizona.cs.hadoop.fs.irods.output.HirodsTextOutputFormat.java
License:Apache License
/**
 * Creates the line record writer for a task attempt.
 *
 * <p>The key/value separator comes from {@code mapred.textoutputformat.separator} (tab when
 * unset). With output compression enabled, the stream is wrapped by the codec obtained via
 * {@code getOutputCompressorClass(job, GzipCodec.class)} and the codec's default extension
 * is used for the work file name.
 *
 * @param job the task attempt whose work file is written
 * @return a writer emitting to the task's default work file
 * @throws IOException if the work file cannot be created
 * @throws InterruptedException declared by the overridden contract
 */
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration jobConf = job.getConfiguration();
    boolean compressOutput = getCompressOutput(job);
    String separator = jobConf.get("mapred.textoutputformat.separator", "\t");

    CompressionCodec outputCodec = null;
    String fileSuffix = "";
    if (compressOutput) {
        Class<? extends CompressionCodec> codecType = getOutputCompressorClass(job, GzipCodec.class);
        outputCodec = ReflectionUtils.newInstance(codecType, jobConf);
        fileSuffix = outputCodec.getDefaultExtension();
    }

    Path outputFile = getDefaultWorkFile(job, fileSuffix);
    FSDataOutputStream fileStream = outputFile.getFileSystem(jobConf).create(outputFile, false);

    if (compressOutput) {
        DataOutputStream compressedStream = new DataOutputStream(outputCodec.createOutputStream(fileStream));
        return new LineRecordWriter<K, V>(compressedStream, separator);
    } else {
        return new LineRecordWriter<K, V>(fileStream, separator);
    }
}
From source file:edu.rutgers.ess.crs.utility.KeyValueCSVOutputFormat.java
License:Apache License
public RecordWriter<Text, TextArrayWritable> getRecordWriter(final TaskAttemptContext context) throws IOException, InterruptedException { final Configuration conf = context.getConfiguration(); final boolean isCompressed = getCompressOutput((JobContext) context); final String tokenSeparator = conf.get(KeyValueCSVOutputFormat.CSV_TOKEN_SEPARATOR_CONFIG); final String keyValueSeparator = conf.get(KeyValueCSVOutputFormat.CSV_KEYVALUE_SEPARATOR_CONFIG); CompressionCodec codec = null; String extension = ""; if (isCompressed) { final Class<? extends CompressionCodec> codecClass = (Class<? extends CompressionCodec>) getOutputCompressorClass( (JobContext) context, GzipCodec.class); codec = ReflectionUtils.newInstance(codecClass, conf); extension = codec.getDefaultExtension(); }//from ww w . ja va 2s . c om final Path file = this.getDefaultWorkFile(context, extension); final FileSystem fs = file.getFileSystem(conf); if (!isCompressed) { final FSDataOutputStream fileOut = fs.create(file, false); return new KeyValueCSVRecordWriter((DataOutputStream) fileOut, tokenSeparator, keyValueSeparator); } final FSDataOutputStream fileOut = fs.create(file, false); return new KeyValueCSVRecordWriter( new DataOutputStream((OutputStream) codec.createOutputStream((OutputStream) fileOut)), tokenSeparator, keyValueSeparator); }
From source file:edu.umn.cs.spatialHadoop.core.GridRecordWriter.java
License:Open Source License
/** * Returns path to a file in which the final cell will be written. * @param cellIndex The index of the cell to retrieve its output path. * @return/*from w w w . j a va 2 s .c o m*/ * @throws IOException */ protected Path getFinalCellPath(int cellIndex) throws IOException { Path path; do { String filename = counter == 0 ? String.format("data_%05d", cellIndex) : String.format("data_%05d_%d", cellIndex, counter); boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf); if (isCompressed) { Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf, GzipCodec.class); // create the named codec CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf); filename += codec.getDefaultExtension(); } path = getFilePath(filename); counter++; } while (fileSystem.exists(path)); return path; }
From source file:edu.umn.cs.spatialHadoop.mapred.TextOutputFormat3.java
License:Open Source License
@Override public RecordWriter<K, V> getRecordWriter(TaskAttemptContext task) throws IOException, InterruptedException { Configuration conf = task.getConfiguration(); boolean isCompressed = getCompressOutput(task); String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t"); if (!isCompressed) { Path file = getDefaultWorkFile(task, ""); FileSystem fs = file.getFileSystem(conf); FSDataOutputStream fileOut = fs.create(file, task); return new LineRecordWriter<K, V>(fileOut, keyValueSeparator); } else {//from w ww . j av a2 s .c o m Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class); // create the named codec CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf); // build the filename including the extension Path file = getDefaultWorkFile(task, codec.getDefaultExtension()); FileSystem fs = file.getFileSystem(conf); FSDataOutputStream fileOut = fs.create(file, task); return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator); } }
From source file:fi.tkk.ics.hadoop.bam.FastqOutputFormat.java
License:Open Source License
/**
 * Creates the FASTQ record writer for a task attempt.
 *
 * <p>With output compression enabled, the stream is wrapped by the codec obtained via
 * {@code getOutputCompressorClass(task, GzipCodec.class)} and the codec's default extension
 * is used for the work file name; otherwise the raw file stream is used as is.
 *
 * @param task the task attempt whose work file is written
 * @return a writer emitting to the task's default work file
 * @throws IOException if the work file cannot be created
 */
public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(task);
    boolean compress = getCompressOutput(task);

    CompressionCodec compressor = null;
    String suffix = "";
    if (compress) {
        Class<? extends CompressionCodec> compressorType = getOutputCompressorClass(task, GzipCodec.class);
        compressor = ReflectionUtils.newInstance(compressorType, conf);
        suffix = compressor.getDefaultExtension();
    }

    Path workFile = getDefaultWorkFile(task, suffix);
    FileSystem workFs = workFile.getFileSystem(conf);
    FSDataOutputStream rawOut = workFs.create(workFile, false);

    DataOutputStream output = compress
            ? new DataOutputStream(compressor.createOutputStream(rawOut))
            : rawOut;
    return new FastqRecordWriter(conf, output);
}
From source file:fi.tkk.ics.hadoop.bam.QseqOutputFormat.java
License:Open Source License
/**
 * Creates the QSEQ record writer for a task attempt.
 *
 * <p>With output compression enabled, the stream is wrapped by the codec obtained via
 * {@code getOutputCompressorClass(task, GzipCodec.class)} and the codec's default extension
 * is used for the work file name; otherwise the raw file stream is used as is.
 *
 * @param task the task attempt whose work file is written
 * @return a writer emitting to the task's default work file
 * @throws IOException if the work file cannot be created
 */
public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    final Configuration conf = ContextUtil.getConfiguration(task);
    final boolean compressed = getCompressOutput(task);

    CompressionCodec codec = null;
    if (compressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);
    }
    final String extension = compressed ? codec.getDefaultExtension() : "";

    final Path file = getDefaultWorkFile(task, extension);
    final FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);

    final DataOutputStream output;
    if (compressed) {
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else {
        output = fileOut;
    }
    return new QseqRecordWriter(conf, output);
}