com.facebook.presto.hive.benchmark.FileFormat.java Source code

Introduction

Here is the source code for com.facebook.presto.hive.benchmark.FileFormat.java, an enum used by Presto's Hive file format benchmark. Each constant creates a reader and a writer for one Hive storage format (RCBinary, RCText, ORC, DWRF, or Parquet): the PRESTO_* constants read through Presto's native page source factories, while the HIVE_* constants read through the generic Hive record cursor providers.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive.benchmark;

import com.facebook.presto.hive.ColumnarBinaryHiveRecordCursorProvider;
import com.facebook.presto.hive.ColumnarTextHiveRecordCursorProvider;
import com.facebook.presto.hive.GenericHiveRecordCursorProvider;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveColumnHandle;
import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.hive.HivePageSourceFactory;
import com.facebook.presto.hive.HiveRecordCursorProvider;
import com.facebook.presto.hive.HiveRecordWriter;
import com.facebook.presto.hive.HiveStorageFormat;
import com.facebook.presto.hive.HiveType;
import com.facebook.presto.hive.HiveTypeTranslator;
import com.facebook.presto.hive.TypeTranslator;
import com.facebook.presto.hive.orc.DwrfPageSourceFactory;
import com.facebook.presto.hive.orc.OrcPageSourceFactory;
import com.facebook.presto.hive.parquet.ParquetPageSourceFactory;
import com.facebook.presto.hive.parquet.ParquetRecordCursorProvider;
import com.facebook.presto.hive.rcfile.RcFilePageSourceFactory;
import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.RecordCursor;
import com.facebook.presto.spi.RecordPageSource;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.facebook.presto.type.TypeRegistry;
import com.google.common.base.Throwables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.joda.time.DateTimeZone;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Properties;

import static com.facebook.presto.hive.HdfsConfigurationUpdater.configureCompression;
import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.hive.metastore.StorageFormat.fromHiveStorageFormat;
import static java.util.stream.Collectors.joining;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB;

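/**
 * The file formats exercised by the Hive file format benchmark. The PRESTO_*
 * constants read through Presto's native HivePageSourceFactory implementations,
 * while the HIVE_* constants read through the generic HiveRecordCursorProvider
 * code paths; all constants write through HiveRecordWriter.
 */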
public enum FileFormat {
    PRESTO_RCBINARY {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.RCBINARY);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.RCBINARY);
        }
    },

    PRESTO_RCTEXT {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.RCTEXT);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.RCTEXT);
        }
    },

    PRESTO_ORC {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HivePageSourceFactory pageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER, false,
                    hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.ORC);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.ORC);
        }
    },

    PRESTO_DWRF {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HivePageSourceFactory pageSourceFactory = new DwrfPageSourceFactory(TYPE_MANAGER, hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.DWRF);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.DWRF);
        }

        @Override
        public boolean supportsDate() {
            return false;
        }
    },

    PRESTO_PARQUET {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HivePageSourceFactory pageSourceFactory = new ParquetPageSourceFactory(TYPE_MANAGER, false,
                    hdfsEnvironment);
            return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.PARQUET);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.PARQUET);
        }
    },

    HIVE_RCBINARY {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HiveRecordCursorProvider cursorProvider = new ColumnarBinaryHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.RCBINARY);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.RCBINARY);
        }
    },

    HIVE_RCTEXT {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HiveRecordCursorProvider cursorProvider = new ColumnarTextHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.RCTEXT);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.RCTEXT);
        }
    },

    HIVE_ORC {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.ORC);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.ORC);
        }
    },

    HIVE_DWRF {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.DWRF);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.DWRF);
        }

        @Override
        public boolean supportsDate() {
            return false;
        }
    },

    HIVE_PARQUET {
        @Override
        public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment,
                File targetFile, List<String> columnNames, List<Type> columnTypes) {
            HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false, hdfsEnvironment);
            return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes,
                    HiveStorageFormat.PARQUET);
        }

        @Override
        public FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
                List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
                throws IOException {
            return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec,
                    HiveStorageFormat.PARQUET);
        }
    };

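    // DWRF does not support the DATE type, so the DWRF constants override this to return false.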
    public boolean supportsDate() {
        return true;
    }

    public abstract ConnectorPageSource createFileFormatReader(ConnectorSession session,
            HdfsEnvironment hdfsEnvironment, File targetFile, List<String> columnNames, List<Type> columnTypes);

    public abstract FormatWriter createFileFormatWriter(ConnectorSession session, File targetFile,
            List<String> columnNames, List<Type> columnTypes, HiveCompressionCodec compressionCodec)
            throws IOException;

    private static final TypeManager TYPE_MANAGER = new TypeRegistry();
    private static final JobConf conf;

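    // Shared Hadoop configuration for all readers and writers. Mapping file:// URIs to
    // RawLocalFileSystem keeps I/O on the local disk and skips the .crc checksum side
    // files created by the default (checksummed) LocalFileSystem.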
    static {
        try {
            conf = new JobConf(new Configuration(false));
            conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

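    // Reads the file through a Hive-style RecordCursor and adapts it to the
    // ConnectorPageSource interface via RecordPageSource.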
    private static ConnectorPageSource createPageSource(HiveRecordCursorProvider cursorProvider,
            ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes,
            HiveStorageFormat format) {
        List<HiveColumnHandle> columnHandles = new ArrayList<>(columnNames.size());
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        for (int i = 0; i < columnNames.size(); i++) {
            String columnName = columnNames.get(i);
            Type columnType = columnTypes.get(i);
            columnHandles
                    .add(new HiveColumnHandle("test", columnName, HiveType.toHiveType(typeTranslator, columnType),
                            columnType.getTypeSignature(), i, REGULAR, Optional.empty()));
        }

        RecordCursor recordCursor = cursorProvider
                .createRecordCursor("test", conf, session, new Path(targetFile.getAbsolutePath()), 0,
                        targetFile.length(), createSchema(format, columnNames, columnTypes), columnHandles,
                        TupleDomain.all(), DateTimeZone.forID(session.getTimeZoneKey().getId()), TYPE_MANAGER)
                .get();
        return new RecordPageSource(columnTypes, recordCursor);
    }

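    // Reads the file through one of Presto's native page source factories,
    // which produce Pages directly without going through a RecordCursor.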
    private static ConnectorPageSource createPageSource(HivePageSourceFactory pageSourceFactory,
            ConnectorSession session, File targetFile, List<String> columnNames, List<Type> columnTypes,
            HiveStorageFormat format) {
        List<HiveColumnHandle> columnHandles = new ArrayList<>(columnNames.size());
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        for (int i = 0; i < columnNames.size(); i++) {
            String columnName = columnNames.get(i);
            Type columnType = columnTypes.get(i);
            columnHandles
                    .add(new HiveColumnHandle("test", columnName, HiveType.toHiveType(typeTranslator, columnType),
                            columnType.getTypeSignature(), i, REGULAR, Optional.empty()));
        }

        return pageSourceFactory.createPageSource(conf, session, new Path(targetFile.getAbsolutePath()), 0,
                targetFile.length(), createSchema(format, columnNames, columnTypes), columnHandles,
                TupleDomain.all(), DateTimeZone.forID(session.getTimeZoneKey().getId())).get();
    }

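    // Writes pages row by row through Hive's serializers using HiveRecordWriter.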
    private static class RecordFormatWriter implements FormatWriter {
        private static final TypeRegistry TYPE_MANAGER = new TypeRegistry();
        private final HiveRecordWriter recordWriter;

        public RecordFormatWriter(File targetFile, List<String> columnNames, List<Type> columnTypes,
                HiveCompressionCodec compressionCodec, HiveStorageFormat format) {
            JobConf config = new JobConf(conf);
            configureCompression(config, compressionCodec);

            recordWriter = new HiveRecordWriter(new Path(targetFile.toURI()), columnNames,
                    fromHiveStorageFormat(format), createSchema(format, columnNames, columnTypes), TYPE_MANAGER,
                    config);
        }

        @Override
        public void writePage(Page page) {
            Block[] columns = page.getBlocks();
            for (int position = 0; position < page.getPositionCount(); position++) {
                recordWriter.addRow(columns, position);
            }
        }

        @Override
        public void close() {
            recordWriter.commit();
        }
    }

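    // Builds the table properties (serde, input format, column names and types)
    // that the Hive readers and writers expect for the given storage format.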
    private static Properties createSchema(HiveStorageFormat format, List<String> columnNames,
            List<Type> columnTypes) {
        Properties schema = new Properties();
        TypeTranslator typeTranslator = new HiveTypeTranslator();
        schema.setProperty(SERIALIZATION_LIB, format.getSerDe());
        schema.setProperty(FILE_INPUT_FORMAT, format.getInputFormat());
        schema.setProperty(META_TABLE_COLUMNS, columnNames.stream().collect(joining(",")));
        schema.setProperty(META_TABLE_COLUMN_TYPES,
                columnTypes.stream().map(type -> HiveType.toHiveType(typeTranslator, type))
                        .map(HiveType::getHiveTypeName).collect(joining(":")));
        return schema;
    }
}
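
Example Usage

The following is a minimal sketch of how the enum above can be driven from benchmark code. The class FileFormatRoundTrip and its roundTrip method are hypothetical names introduced here for illustration; the sketch assumes the caller already has a ConnectorSession, an HdfsEnvironment, the column names and types, and the Pages to write, which the surrounding benchmark code is expected to provide.

package com.facebook.presto.hive.benchmark;

import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.type.Type;

import java.io.File;
import java.io.IOException;
import java.util.List;

public class FileFormatRoundTrip {
    private FileFormatRoundTrip() {}

    /**
     * Writes the given pages with one FileFormat, reads the file back with the
     * same format, and returns the number of rows seen by the reader.
     */
    public static long roundTrip(FileFormat format, ConnectorSession session, HdfsEnvironment hdfsEnvironment,
            File targetFile, List<String> columnNames, List<Type> columnTypes, List<Page> pages,
            HiveCompressionCodec compressionCodec)
            throws IOException {
        // Write every page through the format-specific writer (HiveRecordWriter under the hood).
        FormatWriter writer = format.createFileFormatWriter(session, targetFile, columnNames, columnTypes,
                compressionCodec);
        for (Page page : pages) {
            writer.writePage(page);
        }
        writer.close();

        // Read the file back through the matching reader and count the rows.
        long rows = 0;
        try (ConnectorPageSource pageSource = format.createFileFormatReader(session, hdfsEnvironment, targetFile,
                columnNames, columnTypes)) {
            while (!pageSource.isFinished()) {
                Page page = pageSource.getNextPage();
                if (page != null) {
                    rows += page.getPositionCount();
                }
            }
        }
        return rows;
    }
}

A caller would typically loop over FileFormat.values() with the same pages, skipping DATE columns whenever supportsDate() returns false, to compare the cost of writing and re-reading the data across the Presto-native and Hive-generic code paths.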