/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive.benchmark;

import com.facebook.presto.hive.ColumnarBinaryHiveRecordCursorProvider;
import com.facebook.presto.hive.ColumnarTextHiveRecordCursorProvider;
import com.facebook.presto.hive.GenericHiveRecordCursorProvider;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveColumnHandle;
import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.hive.HivePageSourceFactory;
import com.facebook.presto.hive.HiveRecordCursorProvider;
import com.facebook.presto.hive.HiveRecordWriter;
import com.facebook.presto.hive.HiveStorageFormat;
import com.facebook.presto.hive.HiveType;
import com.facebook.presto.hive.HiveTypeTranslator;
import com.facebook.presto.hive.TypeTranslator;
import com.facebook.presto.hive.orc.DwrfPageSourceFactory;
import com.facebook.presto.hive.orc.OrcPageSourceFactory;
import com.facebook.presto.hive.parquet.ParquetPageSourceFactory;
import com.facebook.presto.hive.parquet.ParquetRecordCursorProvider;
import com.facebook.presto.hive.rcfile.RcFilePageSourceFactory;
import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.RecordCursor;
import com.facebook.presto.spi.RecordPageSource;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.facebook.presto.type.TypeRegistry;
import com.google.common.base.Throwables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.joda.time.DateTimeZone;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Properties;

import static com.facebook.presto.hive.HdfsConfigurationUpdater.configureCompression;
import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.hive.metastore.StorageFormat.fromHiveStorageFormat;
import static java.util.stream.Collectors.joining;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB;

public enum FileFormat
{
    PRESTO_RCBINARY {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCBINARY);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCBINARY);
        }
    },

    PRESTO_RCTEXT {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCTEXT);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCTEXT);
        }
    },

    PRESTO_ORC {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HivePageSourceFactory pageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER, false, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.ORC);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.ORC);
        }
    },

    PRESTO_DWRF {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HivePageSourceFactory pageSourceFactory = new DwrfPageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.DWRF);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.DWRF);
        }

        @Override
        public boolean supportsDate()
        {
            return false;
        }
    },

    PRESTO_PARQUET {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HivePageSourceFactory pageSourceFactory = new ParquetPageSourceFactory(TYPE_MANAGER, false, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.PARQUET);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.PARQUET);
        }
    },

    HIVE_RCBINARY {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HiveRecordCursorProvider cursorProvider = new ColumnarBinaryHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCBINARY);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCBINARY);
        }
    },

    HIVE_RCTEXT {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HiveRecordCursorProvider cursorProvider = new ColumnarTextHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCTEXT);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCTEXT);
        }
    },

    HIVE_ORC {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.ORC);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.ORC);
        }
    },

    HIVE_DWRF {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.DWRF);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.DWRF);
        }

        @Override
        public boolean supportsDate()
        {
            return false;
        }
    },

    HIVE_PARQUET {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes)
        {
            HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false, hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.PARQUET);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException
        {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.PARQUET);
        }
    };

    public boolean supportsDate()
    {
        return true;
    }

    public abstract ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes);

    public abstract FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
            throws IOException;

    private static final TypeManager TYPE_MANAGER = new TypeRegistry();

    private static final JobConf conf;

    static {
        try {
            conf = new JobConf(new Configuration(false));
            conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
        }
        catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    private static ConnectorPageSource createPageSource(HiveRecordCursorProvider cursorProvider, ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveStorageFormat format)
    {
        List<HiveColumnHandle> columnHandles = new ArrayList<>(columnNames.size());
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        for (int i = 0; i < columnNames.size(); i++) {
            String columnName = columnNames.get(i);
            Type columnType = columnTypes.get(i);
            columnHandles.add(new HiveColumnHandle("test", columnName, HiveType.toHiveType(typeTranslator, columnType), columnType.getTypeSignature(), i, REGULAR, Optional.empty()));
        }

        RecordCursor recordCursor = cursorProvider
                .createRecordCursor(
                        "test",
                        conf,
                        session,
                        new Path(targetFile.getAbsolutePath()),
                        0,
                        targetFile.length(),
                        createSchema(format, columnNames, columnTypes),
                        columnHandles,
                        TupleDomain.all(),
                        DateTimeZone.forID(session.getTimeZoneKey().getId()),
                        TYPE_MANAGER)
                .get();
        return new RecordPageSource(columnTypes, recordCursor);
    }

    private static ConnectorPageSource createPageSource(HivePageSourceFactory pageSourceFactory, ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes, HiveStorageFormat format)
    {
        List<HiveColumnHandle> columnHandles = new ArrayList<>(columnNames.size());
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        for (int i = 0; i < columnNames.size(); i++) {
            String columnName = columnNames.get(i);
            Type columnType = columnTypes.get(i);
            columnHandles.add(new HiveColumnHandle("test", columnName, HiveType.toHiveType(typeTranslator, columnType), columnType.getTypeSignature(), i, REGULAR, Optional.empty()));
        }

        return pageSourceFactory
                .createPageSource(
                        conf,
                        session,
                        new Path(targetFile.getAbsolutePath()),
                        0,
                        targetFile.length(),
                        createSchema(format, columnNames, columnTypes),
                        columnHandles,
                        TupleDomain.all(),
                        DateTimeZone.forID(session.getTimeZoneKey().getId()))
                .get();
    }

    private static class RecordFormatWriter
            implements FormatWriter
    {
        private static final TypeRegistry TYPE_MANAGER = new TypeRegistry();

        private final HiveRecordWriter recordWriter;

        public RecordFormatWriter(File targetFile, List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec, HiveStorageFormat format)
        {
            JobConf config = new JobConf(conf);
            configureCompression(config, compressionCodec);

            recordWriter = new HiveRecordWriter(
                    new Path(targetFile.toURI()),
                    columnNames,
                    fromHiveStorageFormat(format),
                    createSchema(format, columnNames, columnTypes),
                    TYPE_MANAGER,
                    config);
        }

        @Override
        public void writePage(Page page)
        {
            Block[] columns = page.getBlocks();
            for (int position = 0; position < page.getPositionCount(); position++) {
                recordWriter.addRow(columns, position);
            }
        }

        @Override
        public void close()
        {
            recordWriter.commit();
        }
    }

    private static Properties createSchema(HiveStorageFormat format, List<String> columnNames, List<Type> columnTypes)
    {
        Properties schema = new Properties();
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        schema.setProperty(SERIALIZATION_LIB, format.getSerDe());
        schema.setProperty(FILE_INPUT_FORMAT, format.getInputFormat());
        schema.setProperty(META_TABLE_COLUMNS, columnNames.stream()
                .collect(joining(",")));
        schema.setProperty(META_TABLE_COLUMN_TYPES, columnTypes.stream()
                .map(type -> HiveType.toHiveType(typeTranslator, type))
                .map(HiveType::getHiveTypeName)
                .collect(joining(":")));
        return schema;
    }
}
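
// --- Usage sketch (not part of the original file) ---
// A minimal, hypothetical example of how a benchmark might exercise this enum:
// write one Page with a FormatWriter, then read the file back through the
// matching page source. The class name FileFormatUsageExample, the method
// roundTrip, and the idea that the session, page, and column metadata come from
// the surrounding harness are assumptions for illustration only; the calls to
// createFileFormatWriter/createFileFormatReader, FormatWriter.writePage/close,
// and ConnectorPageSource.isFinished/getNextPage/close come from the code above
// and the Presto SPI.
class FileFormatUsageExample
{
    static void roundTrip(
            ConnectorSession session,
            HdfsEnvironment hdfsEnvironment,
            File targetFile,
            List<String> columnNames,
            List<Type> columnTypes,
            Page page)
            throws IOException
    {
        // Write a single page as ORC using the Presto-native writer path.
        FormatWriter writer = FileFormat.PRESTO_ORC.createFileFormatWriter(
                session, targetFile, columnNames, columnTypes, HiveCompressionCodec.NONE);
        writer.writePage(page);
        writer.close();

        // Read the file back through the corresponding page source and drain it.
        ConnectorPageSource pageSource = FileFormat.PRESTO_ORC.createFileFormatReader(
                session, hdfsEnvironment, targetFile, columnNames, columnTypes);
        while (!pageSource.isFinished()) {
            Page next = pageSource.getNextPage();
            if (next != null) {
                // A benchmark would typically just touch the blocks or count positions here.
            }
        }
        pageSource.close();
    }
}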