Usage examples for org.apache.hadoop.io.SequenceFile#SYNC_INTERVAL
int SYNC_INTERVAL: the number of bytes between sync points in a SequenceFile.
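A SequenceFile writer embeds a sync marker in the stream roughly every SYNC_INTERVAL bytes, and a reader positioned mid-file scans forward to the next marker before emitting records; this is why the code in the examples below never uses a split or bundle smaller than SYNC_INTERVAL. Before the examples, here is a minimal reader-side sketch of that realignment (illustrative only, not drawn from the files below; the path and key/value types are assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SyncIntervalDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/demo.seq"); // hypothetical input file

    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
      // Skip forward to the first sync marker at or after SYNC_INTERVAL bytes;
      // this is the same realignment a split reader performs at a split boundary.
      reader.sync(SequenceFile.SYNC_INTERVAL);
      IntWritable key = new IntWritable(); // assumed key type of the demo file
      Text value = new Text();             // assumed value type of the demo file
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    }
  }
}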
From source file: com.google.cloud.dataflow.contrib.hadoop.HadoopFileSourceTest.java
License: Apache License
@Test
public void testSplits() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0);
  File file = createFileWithData("tmp.avro", expectedResults);

  HadoopFileSource<IntWritable, Text> source = HadoopFileSource.from(
      file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

  // Assert that the source produces the expected records.
  assertEquals(expectedResults, readFromSource(source, options));

  // Split with a small bundle size (it has to be at least the size of the sync interval).
  List<? extends BoundedSource<KV<IntWritable, Text>>> splits =
      source.splitIntoBundles(SequenceFile.SYNC_INTERVAL, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);

  int nonEmptySplits = 0;
  for (BoundedSource<KV<IntWritable, Text>> subSource : splits) {
    if (readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);
}
From source file: com.google.cloud.dataflow.sdk.io.hdfs.HDFSFileSourceTest.java
License: Apache License
@Test
public void testSplits() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0);
  File file = createFileWithData("tmp.seq", expectedResults);

  HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from(
      file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

  // Assert that the source produces the expected records.
  assertEquals(expectedResults, readFromSource(source, options));

  // Split with a small bundle size (it has to be at least the size of the sync interval).
  List<? extends BoundedSource<KV<IntWritable, Text>>> splits =
      source.splitIntoBundles(SequenceFile.SYNC_INTERVAL, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);

  int nonEmptySplits = 0;
  for (BoundedSource<KV<IntWritable, Text>> subSource : splits) {
    if (readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);
}
From source file: edu.ucsb.cs.hadoop.CustomSequenceFileInputFormat.java
License: Apache License
public CustomSequenceFileInputFormat() {
  // Never create a split smaller than the sync interval, since a reader can
  // only begin emitting records at a sync marker.
  setMinSplitSize(SequenceFile.SYNC_INTERVAL);
}
From source file: org.apache.asterix.hivecompat.io.RCFileInputFormat.java
License: Apache License
public RCFileInputFormat() {
  // Keep every split at least one sync interval long so that each split is
  // guaranteed to contain a sync marker to align on.
  setMinSplitSize(SequenceFile.SYNC_INTERVAL);
}
From source file: org.apache.beam.sdk.io.hdfs.HDFSFileSourceTest.java
License: Apache License
@Test
public void testSplitEstimatedSize() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0);
  File file = createFileWithData("tmp.avro", expectedResults);

  HDFSFileSource<KV<IntWritable, Text>, IntWritable, Text> source = HDFSFileSource.from(
      file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

  long originalSize = source.getEstimatedSizeBytes(options);
  long splitTotalSize = 0;
  List<? extends BoundedSource<KV<IntWritable, Text>>> splits =
      source.splitIntoBundles(SequenceFile.SYNC_INTERVAL, options);
  for (BoundedSource<KV<IntWritable, Text>> splitSource : splits) {
    splitTotalSize += splitSource.getEstimatedSizeBytes(options);
  }

  // Assert that the estimated size of the whole is the sum of its parts.
  assertEquals(originalSize, splitTotalSize);
}
From source file: org.apache.hcatalog.rcfile.RCFileMapReduceInputFormat.java
License: Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // "mapred.min.split.size" is the legacy property name; Hadoop 2+ interprets
  // it as mapreduce.input.fileinputformat.split.minsize.
  job.getConfiguration().setLong("mapred.min.split.size", SequenceFile.SYNC_INTERVAL);
  return super.getSplits(job);
}
From source file: org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat.java
License: Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // Same minimum-split-size floor as above, set through Hive's typed
  // configuration wrapper instead of a raw property name.
  HiveConf.setLongVar(job.getConfiguration(), HiveConf.ConfVars.MAPREDMINSPLITSIZE,
      SequenceFile.SYNC_INTERVAL);
  return super.getSplits(job);
}
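The getSplits() overrides above push the minimum split size into the job configuration from inside an InputFormat. For completeness, here is a minimal sketch (not taken from any of the files above; the job name and class choices are illustrative) of enforcing the same floor from a MapReduce driver using the typed FileInputFormat helper:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

public class MinSplitSizeDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "min-split-demo");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    // Same effect as the getSplits() overrides above: no split may be smaller
    // than the sync interval, so every split contains a sync marker.
    FileInputFormat.setMinInputSplitSize(job, SequenceFile.SYNC_INTERVAL);
    // ... set mapper/reducer, input/output paths, and output types, then submit.
  }
}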