Example usage for the org.apache.hadoop.io.compress.DefaultCodec constructor DefaultCodec()


Introduction

This page collects example usages of the DefaultCodec() constructor of org.apache.hadoop.io.compress.DefaultCodec, drawn from open-source projects.

Prototype

public DefaultCodec()
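
A minimal, hypothetical sketch of how this constructor is typically used (it is not taken from any of the source files below; the file path and payload are placeholders): construct the codec, hand it a Configuration via setConf(), then wrap a raw stream with createOutputStream(). DefaultCodec compresses with DEFLATE and uses the ".deflate" file extension.

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.DefaultCodec;

public class DefaultCodecSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // DefaultCodec is Configurable; give it a Configuration before use,
        // as the examples below do with setConf(conf).
        DefaultCodec codec = new DefaultCodec();
        codec.setConf(conf);

        FileSystem fs = FileSystem.get(conf);
        // Placeholder path; getDefaultExtension() returns ".deflate" for DefaultCodec.
        Path out = new Path("/tmp/example" + codec.getDefaultExtension());

        OutputStream raw = fs.create(out);
        CompressionOutputStream compressed = codec.createOutputStream(raw);
        compressed.write("hello".getBytes(StandardCharsets.UTF_8));
        compressed.close();
    }
}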


Usage

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

@Test
public void testGetWriter() throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    final String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 5;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    DefaultCodec compressionCodec = new DefaultCodec();
    compressionCodec.setConf(conf);
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    FileSystem fs = FileSystem.get(uri, conf);
    Date now = getFixedDate();

    // record older than cut off
    Date recordDate = new Date(now.getTime() - 10 * 1000 - 1);
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    Assert.assertNull(mgr.getWriter(now, recordDate, record));

    // record qualifies, first file
    recordDate = new Date(now.getTime() - 10 * 1000 + 1);
    RecordWriter writer = mgr.getWriter(now, recordDate, record);
    Assert.assertNotNull(writer);
    Path tempPath = writer.getPath();
    Assert.assertEquals(mgr.getPath(recordDate, record), tempPath);
    Path finalPath = mgr.commitWriter(writer);
    //committing a closed writer is a NOP
    Assert.assertNull(mgr.commitWriter(writer));

    Assert.assertEquals(1, getFinalFileNameCount(fs, tempPath.getParent(), prefix));

    // record qualifies, second file
    writer = mgr.getWriter(now, recordDate, record);
    finalPath = mgr.commitWriter(writer);

    Assert.assertEquals(2, getFinalFileNameCount(fs, tempPath.getParent(), prefix));

    // record qualifies, leaving temp file
    writer = mgr.getWriter(now, recordDate, record);
    writer.close();

    // record qualifies, it should roll temp file and create 4th file
    writer = mgr.getWriter(now, recordDate, record);
    finalPath = mgr.commitWriter(writer);
    Assert.assertFalse(fs.exists(tempPath));
    Assert.assertEquals(4, getFinalFileNameCount(fs, tempPath.getParent(), prefix));

    // verifying thresholds because of record count
    writer = mgr.getWriter(now, recordDate, record);
    Assert.assertFalse(mgr.isOverThresholds(writer));
    writer.write(record);
    writer.flush();
    Assert.assertFalse(mgr.isOverThresholds(writer));
    writer.write(record);
    writer.flush();
    Assert.assertTrue(mgr.isOverThresholds(writer));
    writer.write(record);
    mgr.commitWriter(writer);

    // verifying thresholds because of file size
    writer = mgr.getWriter(now, recordDate, record);
    Assert.assertFalse(mgr.isOverThresholds(writer));
    record.set(Field.create("0123456789012345678901234567890123456789012345678901234567890123456789"));
    writer.write(record);
    writer.flush();
    Assert.assertTrue(mgr.isOverThresholds(writer));
    mgr.commitWriter(writer);
}

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

@Test
public void testThresholdRecords() throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 50000;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    DefaultCodec compressionCodec = new DefaultCodec();
    compressionCodec.setConf(conf);
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    Date now = getFixedDate();

    Date recordDate = now;
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    RecordWriter writer = mgr.getWriter(now, recordDate, record);
    Assert.assertNotNull(writer);
    for (int i = 0; i < 2; i++) {
        Assert.assertFalse(mgr.isOverThresholds(writer));
        writer.write(record);
        writer.flush();
    }
    Assert.assertTrue(mgr.isOverThresholds(writer));
    mgr.commitWriter(writer);
}

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

@Test
public void testThresholdSize() throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 4;
    long cutOffRecords = 20;
    HdfsFileType fileType = HdfsFileType.TEXT;
    DefaultCodec compressionCodec = new DefaultCodec();
    compressionCodec.setConf(conf);
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));
    Date now = getFixedDate();

    Date recordDate = now;
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    RecordWriter writer = mgr.getWriter(now, recordDate, record);
    Assert.assertNotNull(writer);
    for (int i = 0; i < 2; i++) {
        Assert.assertFalse(mgr.isOverThresholds(writer));
        writer.write(record);
        writer.flush();
    }
    Assert.assertTrue(mgr.isOverThresholds(writer));
    mgr.commitWriter(writer);
}

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

@Test
public void testNoThreshold() throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 0;
    long cutOffRecords = 0;
    HdfsFileType fileType = HdfsFileType.TEXT;
    DefaultCodec compressionCodec = new DefaultCodec();
    compressionCodec.setConf(conf);
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));
    Date now = getFixedDate();

    Date recordDate = now;
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    RecordWriter writer = mgr.getWriter(now, recordDate, record);
    Assert.assertNotNull(writer);
    for (int i = 0; i < 10; i++) {
        Assert.assertFalse(mgr.isOverThresholds(writer));
        writer.write(record);
        writer.flush();
    }
    Assert.assertFalse(mgr.isOverThresholds(writer));
    mgr.commitWriter(writer);
}

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

private RecordWriterManager getRecordWriterManager(String dirTemplate, long cutOffSecs) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DefaultCodec compressionCodec = new DefaultCodec();
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);

    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, dirTemplate, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));
    return mgr;
}

From source file: crush.CrushUtil.java

License: Apache License

public void crush() throws CrushException {
    if (jobConf == null) {
        jobConf = new JobConf(CrushUtil.class);
    }
    if (codec == null) {
        codec = new DefaultCodec();
        l4j.warn("codec not specified, using DefaultCodec");
    }
    if (compressionType == null) {
        this.compressionType = SequenceFile.CompressionType.BLOCK;
        l4j.warn("compressionType not specified, using BLOCK");
    }
    try {
        if (fs == null) {
            fs = FileSystem.get(jobConf);
        }
        if (!fs.exists(sourcePath)) {
            throw new CrushException(sourcePath + " does not exist");
        }
        if (fs.isFile(sourcePath)) {
            throw new CrushException(sourcePath + " must be a directory");
        }
        FileStatus[] status = fs.listStatus(sourcePath);
        if (status.length == 0 || status.length == 1) {
            return;
        }
        if (this.type == CrushUtil.FileType.SEQUENCEFILE) {
            sequenceCrush(fs, status);
        }
        if (this.type == CrushUtil.FileType.TEXT) {
            textCrush(fs, status);
        }
    } catch (IOException ex) {
        throw new CrushException("Crushed failed" + ex);
    }
}

From source file: dima.kmeansseq.SequenceFile.java

License: Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType) throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, new DefaultCodec(), null,
            new Metadata());
}

From source file: dima.kmeansseq.SequenceFile.java

License: Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @param progress
 *            The Progressable object to track progress.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType, Progressable progress) throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, new DefaultCodec(), progress,
            new Metadata());
}

From source file: edu.bigdata.training.fileformats.compress.SequenceFileWriter.java

public static void main(String[] args) throws IOException {
    String uri = "output";
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(uri);
    IntWritable key = new IntWritable();
    Text value = new Text();
    File infile = new File("src/main/resources/input.txt");
    SequenceFile.Writer writer = null;
    try {
        writer = SequenceFile.createWriter(conf, Writer.file(path), Writer.keyClass(key.getClass()),
                Writer.valueClass(value.getClass()),
                Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)),
                Writer.replication(fs.getDefaultReplication()), Writer.blockSize(1073741824),
                Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec()),
                Writer.progressable(null), Writer.metadata(new Metadata()));
        int ctr = 100;
        List<String> lines = FileUtils.readLines(infile);
        for (String line : lines) {
            key.set(ctr++);
            value.set(line);
            if (ctr < 150) {
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
            }
            writer.append(key, value);
        }
    } finally {
        IOUtils.closeStream(writer);
    }
}
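
The example above writes a block-compressed SequenceFile using DefaultCodec. As a hypothetical counterpart (assuming the same "output" path and the same IntWritable/Text key and value types), the file could be read back with SequenceFile.Reader; the codec class is recorded in the file header, so it does not need to be supplied again when reading.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("output"); // same path the writer example uses

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
        try {
            IntWritable key = new IntWritable();
            Text value = new Text();
            // next() decompresses transparently using the codec stored in the file header.
            while (reader.next(key, value)) {
                System.out.printf("%s\t%s%n", key, value);
            }
        } finally {
            reader.close();
        }
    }
}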

From source file: gobblin.metastore.FsStateStore.java

License: Apache License

/**
 * See {@link StateStore#put(String, String, T)}.
 *
 * <p>/* w  ww  . ja v  a  2 s  .c o m*/
 *   This implementation does not support putting the state object into an existing store as
 *   append is to be supported by the Hadoop SequenceFile (HADOOP-7139).
 * </p>
 */
@Override
public void put(String storeName, String tableName, T state) throws IOException {
    String tmpTableName = this.useTmpFileForPut ? TMP_FILE_PREFIX + tableName : tableName;
    Path tmpTablePath = new Path(new Path(this.storeRootDir, storeName), tmpTableName);

    if (!this.fs.exists(tmpTablePath) && !create(storeName, tmpTableName)) {
        throw new IOException("Failed to create a state file for table " + tmpTableName);
    }

    Closer closer = Closer.create();
    try {
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = closer.register(SequenceFile.createWriter(this.fs, this.conf, tmpTablePath,
                Text.class, this.stateClass, SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
        writer.append(new Text(Strings.nullToEmpty(state.getId())), state);
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }

    if (this.useTmpFileForPut) {
        Path tablePath = new Path(new Path(this.storeRootDir, storeName), tableName);
        HadoopUtils.renamePath(this.fs, tmpTablePath, tablePath);
    }
}