Example usage for org.apache.hadoop.io SequenceFile SYNC_INTERVAL

List of usage examples for org.apache.hadoop.io SequenceFile SYNC_INTERVAL

Introduction

On this page you can find example usages of org.apache.hadoop.io SequenceFile SYNC_INTERVAL.

Prototype

int SYNC_INTERVAL

To view the source code for org.apache.hadoop.io SequenceFile SYNC_INTERVAL, click Source Link.

Document

The number of bytes between sync points.

Usage

From source file:com.google.cloud.dataflow.contrib.hadoop.HadoopFileSourceTest.java

License:Apache License

/**
 * Checks that a {@code HadoopFileSource} built over a generated file reads back the expected
 * records, and that splitting it with a desired bundle size of
 * {@code SequenceFile.SYNC_INTERVAL} yields more than two splits, of which more than two
 * produce at least one record.
 */
@Test
public void testSplits() throws Exception {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

    List<KV<IntWritable, Text>> records = createRandomRecords(3, 10000, 0);
    // NOTE(review): the file is read through SequenceFileInputFormat although it is named
    // ".avro" -- confirm the extension is irrelevant to createFileWithData.
    File dataFile = createFileWithData("tmp.avro", records);

    HadoopFileSource<IntWritable, Text> source = HadoopFileSource.from(dataFile.toString(),
            SequenceFileInputFormat.class, IntWritable.class, Text.class);

    // Reading the unsplit source must return the expected records.
    assertEquals(records, readFromSource(source, pipelineOptions));

    // Use a small bundle size; it has to be at least the size of the sync interval.
    List<? extends BoundedSource<KV<IntWritable, Text>>> bundles = source
            .splitIntoBundles(SequenceFile.SYNC_INTERVAL, pipelineOptions);
    assertTrue(bundles.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, bundles, pipelineOptions);

    // Count the bundles that actually produce records when read on their own.
    int nonEmptyBundles = 0;
    for (BoundedSource<KV<IntWritable, Text>> bundle : bundles) {
        if (readFromSource(bundle, pipelineOptions).size() <= 0) {
            continue;
        }
        nonEmptyBundles++;
    }
    assertTrue(nonEmptyBundles > 2);
}

From source file:com.google.cloud.dataflow.sdk.io.hdfs.HDFSFileSourceTest.java

License:Apache License

/**
 * Checks that an {@code HDFSFileSource} built over a generated file reads back the expected
 * records, and that splitting it with a desired bundle size of
 * {@code SequenceFile.SYNC_INTERVAL} yields more than two splits, of which more than two
 * produce at least one record.
 */
@Test
public void testSplits() throws Exception {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

    List<KV<IntWritable, Text>> records = createRandomRecords(3, 10000, 0);
    File dataFile = createFileWithData("tmp.seq", records);

    HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from(
            dataFile.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

    // Reading the unsplit source must return the expected records.
    assertEquals(records, readFromSource(source, pipelineOptions));

    // Use a small bundle size; it has to be at least the size of the sync interval.
    List<? extends BoundedSource<KV<IntWritable, Text>>> bundles = source
            .splitIntoBundles(SequenceFile.SYNC_INTERVAL, pipelineOptions);
    assertTrue(bundles.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, bundles, pipelineOptions);

    // Count the bundles that actually produce records when read on their own.
    int nonEmptyBundles = 0;
    for (BoundedSource<KV<IntWritable, Text>> bundle : bundles) {
        if (readFromSource(bundle, pipelineOptions).size() <= 0) {
            continue;
        }
        nonEmptyBundles++;
    }
    assertTrue(nonEmptyBundles > 2);
}

From source file:edu.ucsb.cs.hadoop.CustomSequenceFileInputFormat.java

License:Apache License

public CustomSequenceFileInputFormat() {
    setMinSplitSize(SequenceFile.SYNC_INTERVAL);
}

From source file:org.apache.asterix.hivecompat.io.RCFileInputFormat.java

License:Apache License

public RCFileInputFormat() {
    setMinSplitSize(SequenceFile.SYNC_INTERVAL);
}

From source file:org.apache.beam.sdk.io.hdfs.HDFSFileSourceTest.java

License:Apache License

/**
 * Checks that the estimated size reported by the whole source equals the sum of the estimated
 * sizes reported by the splits obtained with a desired bundle size of
 * {@code SequenceFile.SYNC_INTERVAL}.
 */
@Test
public void testSplitEstimatedSize() throws Exception {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

    List<KV<IntWritable, Text>> records = createRandomRecords(3, 10000, 0);
    // NOTE(review): the file is read through SequenceFileInputFormat although it is named
    // ".avro" -- confirm the extension is irrelevant to createFileWithData.
    File dataFile = createFileWithData("tmp.avro", records);

    HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from(
            dataFile.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

    // Take the whole-source estimate first, before any splitting happens.
    long wholeSize = source.getEstimatedSizeBytes(pipelineOptions);

    List<? extends BoundedSource<KV<IntWritable, Text>>> bundles = source
            .splitIntoBundles(SequenceFile.SYNC_INTERVAL, pipelineOptions);

    long summedSize = 0;
    for (BoundedSource<KV<IntWritable, Text>> bundle : bundles) {
        summedSize += bundle.getEstimatedSizeBytes(pipelineOptions);
    }

    // The estimated size of the whole must be the sum of its parts.
    assertEquals(wholeSize, summedSize);
}

From source file:org.apache.hcatalog.rcfile.RCFileMapReduceInputFormat.java

License:Apache License

/**
 * Sets {@code mapred.min.split.size} on the job's configuration to
 * {@code SequenceFile.SYNC_INTERVAL} and then delegates split computation to the superclass.
 *
 * @param job the job context whose configuration is updated and passed on
 * @return the splits returned by the superclass implementation
 * @throws IOException if thrown while setting the value or by the superclass implementation
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final int minSplitBytes = SequenceFile.SYNC_INTERVAL;
    job.getConfiguration().setLong("mapred.min.split.size", minSplitBytes);

    List<InputSplit> splits = super.getSplits(job);
    return splits;
}

From source file:org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat.java

License:Apache License

/**
 * Sets {@code HiveConf.ConfVars.MAPREDMINSPLITSIZE} on the job's configuration to
 * {@code SequenceFile.SYNC_INTERVAL} and then delegates split computation to the superclass.
 *
 * @param job the job context whose configuration is updated and passed on
 * @return the splits returned by the superclass implementation
 * @throws IOException if thrown while setting the value or by the superclass implementation
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final int minSplitBytes = SequenceFile.SYNC_INTERVAL;
    HiveConf.setLongVar(job.getConfiguration(), HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitBytes);

    List<InputSplit> splits = super.getSplits(job);
    return splits;
}