Example usage for org.apache.hadoop.io.compress CompressionCodec getDefaultExtension

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodec#getDefaultExtension().

Prototype

String getDefaultExtension();

Document

Get the default filename extension for this kind of compression.
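
Before the longer examples below, here is a minimal sketch of the typical pattern: instantiate a codec and append its default extension when naming an output file. The class name and output path are illustrative placeholders, not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec through ReflectionUtils so it picks up the configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // getDefaultExtension() includes the leading dot (".gz" for GzipCodec),
        // so it can be appended directly to a filename.
        Path file = new Path("/tmp/output" + codec.getDefaultExtension());
        System.out.println(file); // prints /tmp/output.gz
    }
}

The examples below follow the same pattern when writing compressed output; when reading, CompressionCodecFactory.removeSuffix() strips the same extension to derive an output name.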

Usage

From source file: com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.java

License: Open Source License

public ResourceRecordWriter(FileSystem fs, Path taskWorkPath, CompressionCodec codecIfAny) throws IOException {
    if (fs.exists(taskWorkPath)) {
        throw new IOException("Task work path already exists:" + taskWorkPath.toString());
    }
    fs.mkdirs(taskWorkPath);

    for (OUTPUT output : OUTPUT.values()) {
        OutputStream out;
        if (codecIfAny != null) {
            Path file = new Path(taskWorkPath, output.filename + codecIfAny.getDefaultExtension());
            out = fs.create(file, false);
            out = codecIfAny.createOutputStream(out);
        } else {
            Path file = new Path(taskWorkPath, output.filename);
            out = fs.create(file, false);
        }
        writersMap.put(output, new OutputStreamWriter(out, Charset.forName("UTF-8")));
    }

    Path file = new Path(taskWorkPath, "bySubject.bz2");
    OutputStream compressedOutputStream = fs.create(file, false);
    file = new Path(taskWorkPath, "bySubject.blockOffsets");
    bySubjectOffsetsOutputStream = fs.create(file, false);

    blockOffsetsBuilder = new BlockOffsets.Builder();
    // Create a Writer on a BZip2-compressed OutputStream with a small block
    // size (1 * 100k).
    uncompressedOutputStream = new BZip2OutputStream(compressedOutputStream, 1,
            new BZip2OutputStream.Callback() {
                @Override
                public void blockStart(long blockStartBitOffset) {
                    if (lastBlockStartBitOffset != 0) {
                        blockOffsetsBuilder.setBlockStart(lastBlockStartBitOffset, lastFirstDocId);
                    }
                    lastBlockStartBitOffset = blockStartBitOffset;
                }

                @Override
                public void finish(long totalBitsWritten) {
                    blockOffsetsBuilder.close(totalBitsWritten);
                }
            });
    bySubjectWriter = new OutputStreamWriter(uncompressedOutputStream);
}

From source file: crunch.MaxTemperature.java

License: Apache License

public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    // Strip the codec's default extension (e.g. ".gz") to name the decompressed output.
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

From source file: de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter.java

License: Apache License

/**
 * Creates a WARC file, and opens it for writing. If a file with the same name already
 * exists, it is *overwritten*. Note that this is different behaviour from the other
 * constructor. Yes, this sucks. It will probably change in a future version.
 *
 * @param conf           The Hadoop configuration.
 * @param codec          If null, the file is uncompressed. If non-null, this compression codec
 *                       will be used. The codec's default file extension is appended to the filename.
 * @param workOutputPath The directory and filename prefix to which the data should be
 *                       written. We append a segment number and filename extensions to it.
 * @param progress       An object used by the mapred API for tracking a task's progress.
 * @throws IOException I/O exception
 */
public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
        throws IOException {
    this.conf = conf;
    this.codec = codec;
    this.workOutputPath = workOutputPath;
    this.progress = progress;
    this.extensionFormat = ".seg-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension());
    this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
    createSegment();
}

From source file: dev.geminileft.outputformat.MyTextOutputFormat.java

License: Apache License

public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    String keyValueDelimiter = conf.get(DELIMITER, "\n");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, keyValueDelimiter);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator, keyValueDelimiter);
    }
}

From source file: edu.arizona.cs.hadoop.fs.irods.output.HirodsTextOutputFormat.java

License: Apache License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file: edu.rutgers.ess.crs.utility.KeyValueCSVOutputFormat.java

License: Apache License

public RecordWriter<Text, TextArrayWritable> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();
    final boolean isCompressed = getCompressOutput((JobContext) context);
    final String tokenSeparator = conf.get(KeyValueCSVOutputFormat.CSV_TOKEN_SEPARATOR_CONFIG);
    final String keyValueSeparator = conf.get(KeyValueCSVOutputFormat.CSV_KEYVALUE_SEPARATOR_CONFIG);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        final Class<? extends CompressionCodec> codecClass = (Class<? extends CompressionCodec>) getOutputCompressorClass(
                (JobContext) context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    final Path file = this.getDefaultWorkFile(context, extension);
    final FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        final FSDataOutputStream fileOut = fs.create(file, false);
        return new KeyValueCSVRecordWriter((DataOutputStream) fileOut, tokenSeparator, keyValueSeparator);
    }
    final FSDataOutputStream fileOut = fs.create(file, false);
    return new KeyValueCSVRecordWriter(
            new DataOutputStream((OutputStream) codec.createOutputStream((OutputStream) fileOut)),
            tokenSeparator, keyValueSeparator);
}

From source file: edu.umn.cs.spatialHadoop.core.GridRecordWriter.java

License: Open Source License

/**
 * Returns path to a file in which the final cell will be written.
 * @param cellIndex The index of the cell to retrieve its output path.
 * @return the path to the file in which the final cell will be written
 * @throws IOException
 */
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path path;
    do {
        String filename = counter == 0 ? String.format("data_%05d", cellIndex)
                : String.format("data_%05d_%d", cellIndex, counter);
        boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf);
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf,
                    GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
            filename += codec.getDefaultExtension();
        }

        path = getFilePath(filename);
        counter++;
    } while (fileSystem.exists(path));
    return path;
}

From source file: edu.umn.cs.spatialHadoop.mapred.TextOutputFormat3.java

License: Open Source License

@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext task) throws IOException, InterruptedException {
    Configuration conf = task.getConfiguration();
    boolean isCompressed = getCompressOutput(task);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    if (!isCompressed) {
        Path file = getDefaultWorkFile(task, "");
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, task);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        // build the filename including the extension
        Path file = getDefaultWorkFile(task, codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, task);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}

From source file: fi.tkk.ics.hadoop.bam.FastqOutputFormat.java

License: Open Source License

public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(task);
    boolean isCompressed = getCompressOutput(task);

    CompressionCodec codec = null;
    String extension = "";

    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(task, extension);
    FileSystem fs = file.getFileSystem(conf);

    DataOutputStream output;

    if (isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else
        output = fs.create(file, false);

    return new FastqRecordWriter(conf, output);
}

From source file: fi.tkk.ics.hadoop.bam.QseqOutputFormat.java

License: Open Source License

public RecordWriter<Text, SequencedFragment> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(task);
    boolean isCompressed = getCompressOutput(task);

    CompressionCodec codec = null;
    String extension = "";

    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(task, extension);
    FileSystem fs = file.getFileSystem(conf);

    DataOutputStream output;

    if (isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else
        output = fs.create(file, false);

    return new QseqRecordWriter(conf, output);
}