Example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodec#getDefaultExtension().

Prototype

String getDefaultExtension();

Document

Get the default filename extension for this kind of compression.
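
Before the longer examples below, here is a minimal sketch of the typical pattern: instantiate a codec and append its default extension when naming an output file. The class name and output path are illustrative placeholders, not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec through ReflectionUtils so it picks up the configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // getDefaultExtension() includes the leading dot (".gz" for GzipCodec),
        // so it can be appended directly to a filename.
        Path file = new Path("/tmp/output" + codec.getDefaultExtension());
        System.out.println(file); // prints /tmp/output.gz
    }
}

The examples below follow the same pattern when writing compressed output; when reading, CompressionCodecFactory.removeSuffix() strips the same extension to derive an output name.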

Usage

From source file: com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.java

License: Open Source License

public ResourceRecordWriter(FileSystem fs, Path taskWorkPath, CompressionCodec codecIfAny) throws IOException {
    if (fs.exists(taskWorkPath)) {
        throw new IOException("Task work path already exists:" + taskWorkPath.toString());
    }
    fs.mkdirs(taskWorkPath);

    for (OUTPUT output : OUTPUT.values()) {
        OutputStream out;
        if (codecIfAny != null) {
            Path file = new Path(taskWorkPath, output.filename + codecIfAny.getDefaultExtension());
            out = fs.create(file, false);
            out = codecIfAny.createOutputStream(out);
        } else {
            Path file = new Path(taskWorkPath, output.filename);
            out = fs.create(file, false);
        }
        writersMap.put(output, new OutputStreamWriter(out, Charset.forName("UTF-8")));
    }

    Path file = new Path(taskWorkPath, "bySubject.bz2");
    OutputStream compressedOutputStream = fs.create(file, false);
    file = new Path(taskWorkPath, "bySubject.blockOffsets");
    bySubjectOffsetsOutputStream = fs.create(file, false);

    blockOffsetsBuilder = new BlockOffsets.Builder();
    // Create a Writer on a BZip2-compressed OutputStream with a small block
    // size (1 * 100k).
    uncompressedOutputStream = new BZip2OutputStream(compressedOutputStream, 1,
            new BZip2OutputStream.Callback() {
                @Override
                public void blockStart(long blockStartBitOffset) {
                    if (lastBlockStartBitOffset != 0) {
                        blockOffsetsBuilder.setBlockStart(lastBlockStartBitOffset, lastFirstDocId);
                    }
                    lastBlockStartBitOffset = blockStartBitOffset;
                }

                @Override
                public void finish(long totalBitsWritten) {
                    blockOffsetsBuilder.close(totalBitsWritten);
                }
            });
    bySubjectWriter = new OutputStreamWriter(uncompressedOutputStream);
}

From source file: crunch.MaxTemperature.java

License: Apache License

public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    // Strip the codec's default extension (e.g. ".gz") to name the decompressed output.
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

From source file: de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter.java

License: Apache License

/**
 * Creates a WARC file, and opens it for writing. If a file with the same name already
 * exists, it is *overwritten*. Note that this is different behaviour from the other
 * constructor. Yes, this sucks. It will probably change in a future version.
 *
 * @param conf           The Hadoop configuration.
 * @param codec          If null, the file is uncompressed. If non-null, this compression codec
 *                       will be used. The codec's default file extension is appended to the filename.
 * @param workOutputPath The directory and filename prefix to which the data should be
 *                       written. We append a segment number and filename extensions to it.
 * @param progress       An object used by the mapred API for tracking a task's progress.
 * @throws IOException I/O exception
 */
public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
        throws IOException {
    this.conf = conf;
    this.codec = codec;
    this.workOutputPath = workOutputPath;
    this.progress = progress;
    this.extensionFormat = ".seg-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension());
    this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
    createSegment();
}

From source file: dev.geminileft.outputformat.MyTextOutputFormat.java

License: Apache License

public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    String keyValueDelimiter = conf.get(DELIMITER, "\n");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, keyValueDelimiter);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator, keyValueDelimiter);
    }
}

From source file: edu.arizona.cs.hadoop.fs.irods.output.HirodsTextOutputFormat.java

License: Apache License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file: edu.rutgers.ess.crs.utility.KeyValueCSVOutputFormat.java

License: Apache License

public RecordWriter<Text, TextArrayWritable> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();
    final boolean isCompressed = getCompressOutput((JobContext) context);
    final String tokenSeparator = conf.get(KeyValueCSVOutputFormat.CSV_TOKEN_SEPARATOR_CONFIG);
    final String keyValueSeparator = conf.get(KeyValueCSVOutputFormat.CSV_KEYVALUE_SEPARATOR_CONFIG);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        final Class<? extends CompressionCodec> codecClass = (Class<? extends CompressionCodec>) getOutputCompressorClass(
                (JobContext) context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    final Path file = this.getDefaultWorkFile(context, extension);
    final FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        final FSDataOutputStream fileOut = fs.create(file, false);
        return new KeyValueCSVRecordWriter((DataOutputStream) fileOut, tokenSeparator, keyValueSeparator);
    }
    final FSDataOutputStream fileOut = fs.create(file, false);
    return new KeyValueCSVRecordWriter(
            new DataOutputStream((OutputStream) codec.createOutputStream((OutputStream) fileOut)),
            tokenSeparator, keyValueSeparator);
}

From source file: edu.umn.cs.spatialHadoop.core.GridRecordWriter.java

License: Open Source License

/**
 * Returns path to a file in which the final cell will be written.
 * @param cellIndex The index of the cell to retrieve its output path.
 * @return the path to the file in which the final cell will be written
 * @throws IOException
 */
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path path;
    do {
        String filename = counter == 0 ? String.format("data_%05d", cellIndex)
                : String.format("data_%05d_%d", cellIndex, counter);
        boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf);
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf,
                    GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
            filename += codec.getDefaultExtension();
        }

        path = getFilePath(filename);
        counter++;
    } while (fileSystem.exists(path));
    return path;
}

From source file: edu.umn.cs.spatialHadoop.mapred.TextOutputFormat3.java

License: Open Source License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext task) throws IOException, InterruptedException {
    Configuration conf = task.getConfiguration();
    boolean isCompressed = getCompressOutput(task);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    if (!isCompressed) {
        Path file = getDefaultWorkFile(task, "");
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, task);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        // build the filename including the extension
        Path file = getDefaultWorkFile(task, codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, task);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file: fi.tkk.ics.hadoop.bam.FastqOutputFormat.java

License: Open Source License

public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(task);
    boolean isCompressed = getCompressOutput(task);

    CompressionCodec codec = null;
    String extension = "";

    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(task, extension);
    FileSystem fs = file.getFileSystem(conf);

    DataOutputStream output;

    if (isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else
        output = fs.create(file, false);

    return new FastqRecordWriter(conf, output);
}

From source file: fi.tkk.ics.hadoop.bam.QseqOutputFormat.java

License: Open Source License

public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(task);
    boolean isCompressed = getCompressOutput(task);

    CompressionCodec codec = null;
    String extension = "";

    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(task, extension);
    FileSystem fs = file.getFileSystem(conf);

    DataOutputStream output;

    if (isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else
        output = fs.create(file, false);

    return new QseqRecordWriter(conf, output);
}