// Java tutorial
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.hive; import com.facebook.presto.hive.authentication.NoHdfsAuthentication; import com.facebook.presto.hive.metastore.Column; import com.facebook.presto.hive.metastore.StorageFormat; import com.facebook.presto.hive.metastore.Table; import com.facebook.presto.spi.ConnectorSession; import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.predicate.Domain; import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.testing.TestingConnectorSession; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.airlift.stats.CounterStat; import io.airlift.units.DataSize; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.util.Progressable; import org.testng.annotations.Test; import java.net.URI; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.concurrent.Executor; import static 
com.facebook.presto.hive.HiveBucketing.HiveBucket; import static com.facebook.presto.hive.HiveColumnHandle.pathColumnHandle; import static com.facebook.presto.hive.HiveTestUtils.SESSION; import static com.facebook.presto.hive.HiveType.HIVE_INT; import static com.facebook.presto.hive.HiveType.HIVE_STRING; import static com.facebook.presto.hive.HiveUtil.getRegularColumnHandles; import static com.facebook.presto.spi.predicate.TupleDomain.withColumnDomains; import static com.facebook.presto.spi.type.VarcharType.VARCHAR; import static com.google.common.collect.ImmutableList.toImmutableList; import static io.airlift.concurrent.Threads.daemonThreadsNamed; import static io.airlift.slice.Slices.utf8Slice; import static io.airlift.units.DataSize.Unit.GIGABYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; import static java.util.concurrent.Executors.newCachedThreadPool; import static org.testng.Assert.assertEquals; public class TestBackgroundHiveSplitLoader { private static final int BUCKET_COUNT = 2; private static final String SAMPLE_PATH = "hdfs://VOL1:9000/db_name/table_name/000000_0"; private static final String SAMPLE_PATH_FILTERED = "hdfs://VOL1:9000/db_name/table_name/000000_1"; private static final String TEST_CONNECTOR_ID = "test_connector"; private static final Path RETURNED_PATH = new Path(SAMPLE_PATH); private static final Path FILTERED_PATH = new Path(SAMPLE_PATH_FILTERED); private static final Executor EXECUTOR = newCachedThreadPool(daemonThreadsNamed("test-%s")); private static final TupleDomain<HiveColumnHandle> RETURNED_PATH_DOMAIN = withColumnDomains( ImmutableMap.of(pathColumnHandle(), Domain.singleValue(VARCHAR, utf8Slice(RETURNED_PATH.toString())))); private static final List<LocatedFileStatus> TEST_FILES = ImmutableList.of(locatedFileStatus(RETURNED_PATH), locatedFileStatus(FILTERED_PATH)); private static final List<Column> PARTITION_COLUMNS = ImmutableList .of(new Column("partitionColumn", HIVE_INT, Optional.empty())); private static final 
Optional<HiveBucketProperty> BUCKET_PROPERTY = Optional .of(new HiveBucketProperty(ImmutableList.of("col1"), BUCKET_COUNT)); private static final Table SIMPLE_TABLE = table(ImmutableList.of(), Optional.empty()); private static final Table PARTITIONED_TABLE = table(PARTITION_COLUMNS, BUCKET_PROPERTY); @Test public void testNoPathFilter() throws Exception { BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, TupleDomain.none()); HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader, TupleDomain.none()); backgroundHiveSplitLoader.start(hiveSplitSource); assertEquals(drain(hiveSplitSource).size(), 2); } @Test public void testPathFilter() throws Exception { BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, RETURNED_PATH_DOMAIN); HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader, RETURNED_PATH_DOMAIN); backgroundHiveSplitLoader.start(hiveSplitSource); List<String> paths = drain(hiveSplitSource); assertEquals(paths.size(), 1); assertEquals(paths.get(0), RETURNED_PATH.toString()); } @Test public void testPathFilterOneBucketMatchPartitionedTable() throws Exception { BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, RETURNED_PATH_DOMAIN, ImmutableList.of(new HiveBucket(0, BUCKET_COUNT), new HiveBucket(1, BUCKET_COUNT)), PARTITIONED_TABLE, Optional.empty()); HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader, RETURNED_PATH_DOMAIN); backgroundHiveSplitLoader.start(hiveSplitSource); List<String> paths = drain(hiveSplitSource); assertEquals(paths.size(), 1); assertEquals(paths.get(0), RETURNED_PATH.toString()); } @Test public void testPathFilterBucketedPartitionedTable() throws Exception { BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, RETURNED_PATH_DOMAIN, ImmutableList.of(), PARTITIONED_TABLE, Optional.of(new 
HiveBucketHandle(getRegularColumnHandles(PARTITIONED_TABLE), BUCKET_COUNT))); HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader, RETURNED_PATH_DOMAIN); backgroundHiveSplitLoader.start(hiveSplitSource); List<String> paths = drain(hiveSplitSource); assertEquals(paths.size(), 1); assertEquals(paths.get(0), RETURNED_PATH.toString()); } @Test public void testEmptyFileWithNoBlocks() throws Exception { BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( ImmutableList.of(locatedFileStatusWithNoBlocks(RETURNED_PATH)), TupleDomain.none()); HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader, TupleDomain.none()); backgroundHiveSplitLoader.start(hiveSplitSource); List<HiveSplit> splits = drainSplits(hiveSplitSource); assertEquals(splits.size(), 1); assertEquals(splits.get(0).getPath(), RETURNED_PATH.toString()); assertEquals(splits.get(0).getLength(), 0); } private List<String> drain(HiveSplitSource source) throws Exception { return drainSplits(source).stream().map(HiveSplit::getPath).collect(toImmutableList()); } private List<HiveSplit> drainSplits(HiveSplitSource source) throws Exception { ImmutableList.Builder<HiveSplit> splits = ImmutableList.builder(); while (!source.isFinished()) { source.getNextBatch(100).get().stream().map(HiveSplit.class::cast).forEach(splits::add); } return splits.build(); } private static BackgroundHiveSplitLoader backgroundHiveSplitLoader(List<LocatedFileStatus> files, TupleDomain<HiveColumnHandle> tupleDomain) { return backgroundHiveSplitLoader(files, tupleDomain, ImmutableList.of(), SIMPLE_TABLE, Optional.empty()); } private static BackgroundHiveSplitLoader backgroundHiveSplitLoader(List<LocatedFileStatus> files, TupleDomain<HiveColumnHandle> compactEffectivePredicate, List<HiveBucket> hiveBuckets, Table table, Optional<HiveBucketHandle> bucketHandle) { List<HivePartitionMetadata> hivePartitionMetadatas = ImmutableList.of(new HivePartitionMetadata( new 
HivePartition(new SchemaTableName("testSchema", "table_name"), ImmutableList.of()), Optional.empty(), ImmutableMap.of())); ConnectorSession connectorSession = new TestingConnectorSession( new HiveSessionProperties(new HiveClientConfig().setMaxSplitSize(new DataSize(1.0, GIGABYTE))) .getSessionProperties()); return new BackgroundHiveSplitLoader(table, hivePartitionMetadatas, compactEffectivePredicate, bucketHandle, hiveBuckets, connectorSession, new TestingHdfsEnvironment(), new NamenodeStats(), new TestingDirectoryLister(files), EXECUTOR, 2, false); } private static HiveSplitSource hiveSplitSource(BackgroundHiveSplitLoader backgroundHiveSplitLoader, TupleDomain<HiveColumnHandle> compactEffectivePredicate) { return new HiveSplitSource(SESSION, SIMPLE_TABLE.getDatabaseName(), SIMPLE_TABLE.getTableName(), compactEffectivePredicate, 1, 1, new DataSize(32, MEGABYTE), backgroundHiveSplitLoader, EXECUTOR, new CounterStat()); } private static Table table(List<Column> partitionColumns, Optional<HiveBucketProperty> bucketProperty) { Table.Builder tableBuilder = Table.builder(); tableBuilder.getStorageBuilder() .setStorageFormat(StorageFormat.create("com.facebook.hive.orc.OrcSerde", "org.apache.hadoop.hive.ql.io.RCFileInputFormat", "org.apache.hadoop.hive.ql.io.RCFileInputFormat")) .setLocation("hdfs://VOL1:9000/db_name/table_name").setSkewed(false) .setBucketProperty(bucketProperty).setSorted(false); return tableBuilder.setDatabaseName("test_dbname").setOwner("testOwner").setTableName("test_table") .setTableType(TableType.MANAGED_TABLE.toString()) .setDataColumns(ImmutableList.of(new Column("col1", HIVE_STRING, Optional.empty()))) .setParameters(ImmutableMap.of()).setPartitionColumns(partitionColumns).build(); } private static LocatedFileStatus locatedFileStatus(Path path) { return new LocatedFileStatus(0L, false, 0, 0L, 0L, 0L, null, null, null, null, path, new BlockLocation[] { new BlockLocation() }); } private static LocatedFileStatus locatedFileStatusWithNoBlocks(Path 
path) { return new LocatedFileStatus(0L, false, 0, 0L, 0L, 0L, null, null, null, null, path, new BlockLocation[] {}); } private static class TestingDirectoryLister implements DirectoryLister { private final List<LocatedFileStatus> files; public TestingDirectoryLister(List<LocatedFileStatus> files) { this.files = files; } @Override public RemoteIterator<LocatedFileStatus> list(FileSystem fs, Path path) { return new RemoteIterator<LocatedFileStatus>() { private final Iterator<LocatedFileStatus> iterator = files.iterator(); @Override public boolean hasNext() { return iterator.hasNext(); } @Override public LocatedFileStatus next() { return iterator.next(); } }; } } private static class TestingHdfsEnvironment extends HdfsEnvironment { public TestingHdfsEnvironment() { super(new HiveHdfsConfiguration(new HdfsConfigurationUpdater(new HiveClientConfig())), new HiveClientConfig(), new NoHdfsAuthentication()); } @Override public FileSystem getFileSystem(String user, Path path, Configuration configuration) { return new TestingHdfsFileSystem(); } } private static class TestingHdfsFileSystem extends FileSystem { @Override public boolean delete(Path f, boolean recursive) { throw new UnsupportedOperationException(); } @Override public boolean rename(Path src, Path dst) { throw new UnsupportedOperationException(); } @Override public void setWorkingDirectory(Path dir) { throw new UnsupportedOperationException(); } @Override public FileStatus[] listStatus(Path f) { throw new UnsupportedOperationException(); } @Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) { throw new UnsupportedOperationException(); } @Override public boolean mkdirs(Path f, FsPermission permission) { throw new UnsupportedOperationException(); } @Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) { throw new UnsupportedOperationException(); } @Override public 
FSDataInputStream open(Path f, int buffersize) { throw new UnsupportedOperationException(); } @Override public FileStatus getFileStatus(Path f) { throw new UnsupportedOperationException(); } @Override public Path getWorkingDirectory() { throw new UnsupportedOperationException(); } @Override public URI getUri() { throw new UnsupportedOperationException(); } } }