// Java tutorial
/** * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kitesdk.data.spi.filesystem; import org.kitesdk.data.Signalable; import com.google.common.collect.Lists; import org.kitesdk.data.Dataset; import org.kitesdk.data.DatasetDescriptor; import org.kitesdk.data.DatasetException; import org.kitesdk.data.DatasetReader; import org.kitesdk.data.Format; import org.kitesdk.data.Formats; import org.kitesdk.data.MiniDFSTest; import org.kitesdk.data.URIBuilder; import org.kitesdk.data.ValidationException; import org.kitesdk.data.impl.Accessor; import org.kitesdk.data.spi.PartitionKey; import org.kitesdk.data.PartitionStrategy; import com.google.common.collect.Sets; import com.google.common.io.Files; import java.io.IOException; import java.net.URI; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Set; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.kitesdk.data.CompressionType; import org.kitesdk.data.TestHelpers; import org.kitesdk.data.spi.PartitionedDataset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.*; import 
org.kitesdk.data.spi.FieldPartitioner; @RunWith(Parameterized.class) public class TestFileSystemDataset extends MiniDFSTest { private static final Logger LOG = LoggerFactory.getLogger(TestFileSystemDataset.class); @Parameterized.Parameters public static Collection<Object[]> data() throws IOException { MiniDFSTest.setupFS(); Object[][] data = new Object[][] { { Formats.AVRO, getDFS(), CompressionType.Uncompressed }, { Formats.AVRO, getDFS(), CompressionType.Snappy }, { Formats.AVRO, getDFS(), CompressionType.Deflate }, { Formats.AVRO, getDFS(), CompressionType.Bzip2 }, { Formats.AVRO, getFS(), CompressionType.Uncompressed }, { Formats.AVRO, getFS(), CompressionType.Snappy }, { Formats.AVRO, getFS(), CompressionType.Deflate }, { Formats.AVRO, getFS(), CompressionType.Bzip2 }, { Formats.PARQUET, getDFS(), CompressionType.Uncompressed }, { Formats.PARQUET, getDFS(), CompressionType.Snappy }, { Formats.PARQUET, getDFS(), CompressionType.Deflate }, { Formats.PARQUET, getFS(), CompressionType.Uncompressed }, { Formats.PARQUET, getFS(), CompressionType.Snappy }, { Formats.PARQUET, getFS(), CompressionType.Deflate } }; return Arrays.asList(data); } private final Format format; private final FileSystem fileSystem; private final CompressionType compressionType; private Path testDirectory; public TestFileSystemDataset(Format format, FileSystem fs, CompressionType compressionType) { this.format = format; this.fileSystem = fs; this.compressionType = compressionType; } @Before public void setUp() throws IOException { testDirectory = fileSystem.makeQualified(new Path(Files.createTempDir().getAbsolutePath())); } @After public void tearDown() throws IOException { fileSystem.delete(testDirectory, true); } @Test public void testWriteAndRead() throws IOException { FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("test") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schemaUri(USER_SCHEMA_URL).format(format) 
.compressionType(compressionType).location(testDirectory).build()) .type(Record.class).build(); Assert.assertFalse("Dataset is not partitioned", ds.getDescriptor().isPartitioned()); writeTestUsers(ds, 10); checkTestUsers(ds, 10); } @Test @SuppressWarnings("deprecation") public void testPartitionedWriterSingle() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor().isPartitioned()); Assert.assertEquals(partitionStrategy, ds.getDescriptor().getPartitionStrategy()); writeTestUsers(ds, 10); Assert.assertTrue("Partitioned directory 0 exists", fileSystem.exists(new Path(testDirectory, "username_hash=0"))); Assert.assertTrue("Partitioned directory 1 exists", fileSystem.exists(new Path(testDirectory, "username_hash=1"))); checkTestUsers(ds, 10); PartitionKey key0 = new PartitionKey(0); PartitionKey key1 = new PartitionKey(1); int total = readTestUsersInPartition(ds, key0, null) + readTestUsersInPartition(ds, key1, null); Assert.assertEquals(10, total); testPartitionKeysAreEqual(ds, key0, key1); Set<Record> records = Sets.newHashSet(); for (Dataset dataset : ds.getPartitions()) { Assert.assertFalse("Partitions should not have further partitions", dataset.getDescriptor().isPartitioned()); records.addAll(materialize(ds)); } checkTestUsers(records, 10); } @Test @SuppressWarnings("deprecation") public void testPartitionedWriterSingleNullableField() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).build(); FileSystemDataset<Record> ds = 
new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_NULLABLE_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor().isPartitioned()); Assert.assertEquals(partitionStrategy, ds.getDescriptor().getPartitionStrategy()); writeTestUsers(ds, 10); Assert.assertTrue("Partitioned directory 0 exists", fileSystem.exists(new Path(testDirectory, "username_hash=0"))); Assert.assertTrue("Partitioned directory 1 exists", fileSystem.exists(new Path(testDirectory, "username_hash=1"))); checkTestUsers(ds, 10); PartitionKey key0 = new PartitionKey(0); PartitionKey key1 = new PartitionKey(1); int total = readTestUsersInPartition(ds, key0, null) + readTestUsersInPartition(ds, key1, null); Assert.assertEquals(10, total); testPartitionKeysAreEqual(ds, key0, key1); Set<Record> records = Sets.newHashSet(); for (Dataset dataset : ds.getPartitions()) { Assert.assertFalse("Partitions should not have further partitions", dataset.getDescriptor().isPartitioned()); records.addAll(materialize(ds)); } checkTestUsers(records, 10); } @Test @SuppressWarnings("deprecation") public void testPartitionedWriterDouble() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).hash("email", 3) .build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor().isPartitioned()); Assert.assertEquals(partitionStrategy, 
ds.getDescriptor().getPartitionStrategy()); writeTestUsers(ds, 10); checkTestUsers(ds, 10); PartitionKey key0 = new PartitionKey(0); PartitionKey key1 = new PartitionKey(1); int total = readTestUsersInPartition(ds, key0, "email_hash") + readTestUsersInPartition(ds, key0, "email_hash"); Assert.assertEquals(10, total); total = 0; for (int i1 = 0; i1 < 2; i1++) { for (int i2 = 0; i2 < 3; i2++) { String part = "username_hash=" + i1 + "/email_hash=" + i2; Assert.assertTrue("Partitioned directory " + part + " exists", fileSystem.exists(new Path(testDirectory, part))); total += readTestUsersInPartition(ds, new PartitionKey(i1, i2), null); } } Assert.assertEquals(10, total); testPartitionKeysAreEqual(ds, key0, key1); Set<Record> records = Sets.newHashSet(); for (Dataset<Record> dataset : ds.getPartitions()) { Assert.assertTrue("Partitions should have further partitions", dataset.getDescriptor().isPartitioned()); records.addAll(materialize(ds)); } checkTestUsers(records, 10); } @Test @SuppressWarnings("deprecation") public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .location(testDirectory).partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); Assert.assertNull(ds.getPartition(new PartitionKey(1), false)); } @Test @SuppressWarnings("deprecation") public void testWriteToSubpartition() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", "username_part", 2) .hash("email", 3).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new 
DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); PartitionKey key = new PartitionKey(1); FileSystemDataset<Record> userPartition = (FileSystemDataset<Record>) ds.getPartition(key, true); Assert.assertEquals(key, userPartition.getPartitionKey()); writeTestUsers(userPartition, 1); Assert.assertTrue("Partitioned directory exists", fileSystem.exists(new Path(testDirectory, "username_part=1/email_hash=2"))); Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash")); } @Test @SuppressWarnings("deprecation") public void testDropPartition() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); writeTestUsers(ds, 10); Assert.assertTrue(fileSystem.isDirectory(new Path(testDirectory, "username_hash=0"))); Assert.assertTrue(fileSystem.isDirectory(new Path(testDirectory, "username_hash=1"))); ds.dropPartition(new PartitionKey(0)); Assert.assertFalse(fileSystem.isDirectory(new Path(testDirectory, "username_hash=0"))); ds.dropPartition(new PartitionKey(1)); Assert.assertFalse(fileSystem.isDirectory(new Path(testDirectory, "username_hash=1"))); DatasetException caught = null; try { ds.dropPartition(new PartitionKey(0)); } catch (DatasetException e) { caught = e; } Assert.assertNotNull(caught); } @Test public void testMerge() throws IOException { PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).build(); FileSystemDataset<Record> ds = new 
FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); writeTestUsers(ds, 10); checkTestUsers(ds, 10); Path newTestDirectory = fileSystem.makeQualified(new Path(Files.createTempDir().getAbsolutePath())); FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(newTestDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); writeTestUsers(dsUpdate, 5, 10); checkTestUsers(dsUpdate, 5, 10); ds.merge(dsUpdate); checkTestUsers(dsUpdate, 0); checkTestUsers(ds, 15); } @Test(expected = ValidationException.class) public void testCannotMergeDatasetsWithDifferentFormats() throws IOException { FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()).descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA) .format(Formats.AVRO).location(testDirectory).build()) .type(Record.class).build(); FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()).descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA) .format(Formats.PARQUET).location(testDirectory).build()) .type(Record.class).build(); ds.merge(dsUpdate); } @Test(expected = ValidationException.class) public void testCannotMergeDatasetsWithDifferentPartitionStrategies() throws IOException { FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new 
DatasetDescriptor.Builder().schema(USER_SCHEMA).location(testDirectory) .partitionStrategy(new PartitionStrategy.Builder().hash("username", 2).build()).build()) .type(Record.class).build(); FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).location(testDirectory) .partitionStrategy( new PartitionStrategy.Builder().hash("username", 2).hash("email", 3).build()) .build()) .type(Record.class).build(); ds.merge(dsUpdate); } @Test(expected = ValidationException.class) public void testCannotMergeDatasetsWithDifferentSchemas() throws IOException { FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(STRING_SCHEMA).location(testDirectory).build()) .type(Record.class).build(); FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).location(testDirectory).build()) .type(Record.class).build(); ds.merge(dsUpdate); } @Test public void testPathIterator_Directory() { FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory).build()) .type(Record.class).build(); List<Path> dirPaths = Lists.newArrayList(ds.dirIterator()); Assert.assertEquals("dirIterator for non-partitioned dataset should yield a single path.", 1, dirPaths.size()); Assert.assertEquals("dirIterator should yield absolute paths.", testDirectory, dirPaths.get(0)); } @Test @SuppressWarnings("deprecation") public void testPathIterator_Partition_Directory() { PartitionStrategy 
partitionStrategy = new PartitionStrategy.Builder().hash("username", 2).hash("email", 3) .build(); final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns") .name("partitioned-users").configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .compressionType(compressionType).location(testDirectory) .partitionStrategy(partitionStrategy).build()) .type(Record.class).build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor().isPartitioned()); Assert.assertEquals(partitionStrategy, ds.getDescriptor().getPartitionStrategy()); writeTestUsers(ds, 10); checkTestUsers(ds, 10); List<Path> dirPaths = Lists.newArrayList(ds.dirIterator()); // 2 user directories * 3 email directories Assert.assertEquals(6, dirPaths.size()); Assert.assertTrue("dirIterator should yield absolute paths.", dirPaths.get(0).isAbsolute()); FileSystemDataset<Record> partition = (FileSystemDataset<Record>) ds.getPartition(new PartitionKey(1, 2), false); List<Path> leafPaths = Lists.newArrayList(partition.dirIterator()); Assert.assertEquals(1, leafPaths.size()); final Path leafPath = leafPaths.get(0); Assert.assertTrue("dirIterator should yield absolute paths.", leafPath.isAbsolute()); Assert.assertEquals(new PartitionKey(1, 2), ds.keyFromDirectory(leafPath)); Assert.assertEquals(new PartitionKey(1), ds.keyFromDirectory(leafPath.getParent())); Assert.assertEquals(new PartitionKey(), ds.keyFromDirectory(leafPath.getParent().getParent())); TestHelpers.assertThrows("Path with too many components", IllegalStateException.class, new Runnable() { @Override public void run() { ds.keyFromDirectory(new Path(leafPath, "extra_dir")); } }); TestHelpers.assertThrows("Non-relative path", IllegalStateException.class, new Runnable() { @Override public void run() { ds.keyFromDirectory(new Path("hdfs://different_host/")); } }); } @Test public void testDeleteAllWithoutPartitions() { final FileSystemDataset<Record> ds = new 
FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()).descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA) .format(format).location(testDirectory).build()) .type(Record.class).build(); writeTestUsers(ds, 10); Assert.assertTrue(ds.deleteAll()); checkReaderBehavior(ds.newReader(), 0, (RecordValidator<Record>) null); } @Test public void signalReadyOnUnboundedDataset() { final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .location(testDirectory).build()) .type(Record.class).uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()), "ns", "name")) .build(); Assert.assertFalse("Unbounded dataset has not been signaled", ds.isReady()); ds.signalReady(); Assert.assertTrue("Unbounded dataset has been signaled and should be ready", ds.isReady()); } @Test public void testReadySignalUpdatesModifiedTime() { final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>().namespace("ns").name("users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .location(testDirectory).build()) .type(Record.class).uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()), "ns", "name")) .build(); Assert.assertFalse("Dataset should not be ready before being signaled", ds.isReady()); // the modified time depends on the filesystem, and may only be granular to the second // signal and check until the modified time is after the current time, or until // enough time has past that the signal should have been distinguishable long signaledTime = 0; long currentTime = System.currentTimeMillis(); while (currentTime >= signaledTime && (System.currentTimeMillis() - currentTime) <= 2000) { ds.signalReady(); signaledTime = ds.getLastModified(); } Assert.assertTrue("Dataset should have been 
signaled as ready", ds.isReady()); Assert.assertTrue("Signal should update the modified time", signaledTime > currentTime); Assert.assertFalse("Only the dataset should have been signaled", ((Signalable) ds.with("username", "bob")).isReady()); } @SuppressWarnings("deprecation") private int readTestUsersInPartition(FileSystemDataset<Record> ds, PartitionKey key, String subpartitionName) { int readCount = 0; DatasetReader<Record> reader = null; try { PartitionedDataset<Record> partition = ds.getPartition(key, false); if (subpartitionName != null) { List<FieldPartitioner> fieldPartitioners = Accessor.getDefault() .getFieldPartitioners(partition.getDescriptor().getPartitionStrategy()); Assert.assertEquals(1, fieldPartitioners.size()); Assert.assertEquals(subpartitionName, fieldPartitioners.get(0).getName()); } reader = partition.newReader(); for (GenericData.Record actualRecord : reader) { Assert.assertEquals(actualRecord.toString(), key.get(0), (actualRecord.get("username").hashCode() & Integer.MAX_VALUE) % 2); if (key.getLength() > 1) { Assert.assertEquals(key.get(1), (actualRecord.get("email").hashCode() & Integer.MAX_VALUE) % 3); } readCount++; } } finally { if (reader != null) { reader.close(); } } return readCount; } }