io.prestosql.orc.metadata.OrcMetadataWriter.java Source code

Java tutorial

Introduction

Here is the source code for io.prestosql.orc.metadata.OrcMetadataWriter.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.orc.metadata;

import com.google.common.collect.ImmutableList;
import com.google.common.io.CountingOutputStream;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import io.prestosql.orc.metadata.OrcType.OrcTypeKind;
import io.prestosql.orc.metadata.Stream.StreamKind;
import io.prestosql.orc.metadata.statistics.ColumnStatistics;
import io.prestosql.orc.metadata.statistics.StripeStatistics;
import io.prestosql.orc.proto.OrcProto;
import io.prestosql.orc.proto.OrcProto.RowIndexEntry;
import io.prestosql.orc.proto.OrcProto.Type;
import io.prestosql.orc.proto.OrcProto.Type.Builder;
import io.prestosql.orc.proto.OrcProto.UserMetadataItem;
import io.prestosql.orc.protobuf.ByteString;
import io.prestosql.orc.protobuf.MessageLite;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import java.util.Map.Entry;

import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.Math.toIntExact;
import static java.util.stream.Collectors.toList;

/**
 * {@link MetadataWriter} implementation that converts the ORC metadata model objects of this
 * package into their {@code OrcProto} protobuf counterparts and serializes them to an output.
 *
 * <p>Every {@code write*} method returns the number of bytes that the serialized protobuf
 * message occupied in the output.
 */
public class OrcMetadataWriter implements MetadataWriter {
    // see https://github.com/prestosql/orc-protobuf/blob/master/src/main/protobuf/orc_proto.proto
    private static final int PRESTO_WRITER_ID = 2;
    // in order to change this value, the master Apache ORC proto file must be updated
    private static final int ORC_WRITER_VERSION = 6;
    private static final List<Integer> ORC_METADATA_VERSION = ImmutableList.of(0, 12);

    /**
     * Returns the ORC metadata version written into the postscript.
     *
     * @return the immutable version list {@code [0, 12]}
     */
    @Override
    public List<Integer> getOrcMetadataVersion() {
        return ORC_METADATA_VERSION;
    }

    /**
     * Serializes the postscript message.
     *
     * @param output destination of the serialized message
     * @param footerLength value stored in the postscript's footer length field
     * @param metadataLength value stored in the postscript's metadata length field
     * @param compression compression kind recorded in the postscript
     * @param compressionBlockSize compression block size recorded in the postscript
     * @return number of bytes written
     * @throws IOException if writing to {@code output} fails
     * @throws IllegalArgumentException if {@code compression} has no protobuf equivalent
     */
    @Override
    public int writePostscript(SliceOutput output, int footerLength, int metadataLength,
            CompressionKind compression, int compressionBlockSize) throws IOException {
        OrcProto.PostScript postScriptProtobuf = OrcProto.PostScript.newBuilder()
                .addAllVersion(ORC_METADATA_VERSION)
                .setFooterLength(footerLength)
                .setMetadataLength(metadataLength)
                .setCompression(toCompression(compression))
                .setCompressionBlockSize(compressionBlockSize)
                .setWriterVersion(ORC_WRITER_VERSION)
                .build();

        return writeProtobufObject(output, postScriptProtobuf);
    }

    /**
     * Serializes the file metadata message, which carries the per-stripe statistics.
     *
     * @param output destination of the serialized message
     * @param metadata source of the stripe statistics list
     * @return number of bytes written
     * @throws IOException if writing to {@code output} fails
     */
    @Override
    public int writeMetadata(SliceOutput output, Metadata metadata) throws IOException {
        List<OrcProto.StripeStatistics> stripeStatistics = metadata.getStripeStatsList().stream()
                .map(OrcMetadataWriter::toStripeStatistics)
                .collect(toList());

        OrcProto.Metadata metadataProtobuf = OrcProto.Metadata.newBuilder()
                .addAllStripeStats(stripeStatistics)
                .build();

        return writeProtobufObject(output, metadataProtobuf);
    }

    /** Converts the column statistics of one stripe into the protobuf stripe statistics message. */
    private static OrcProto.StripeStatistics toStripeStatistics(StripeStatistics stripeStatistics) {
        List<OrcProto.ColumnStatistics> columnStatistics = stripeStatistics.getColumnStatistics().stream()
                .map(OrcMetadataWriter::toColumnStatistics)
                .collect(toList());

        return OrcProto.StripeStatistics.newBuilder()
                .addAllColStats(columnStatistics)
                .build();
    }

    /**
     * Serializes the file footer: writer id, row counts, stripes, types, file-level column
     * statistics and user metadata.
     *
     * @param output destination of the serialized message
     * @param footer footer model to serialize
     * @return number of bytes written
     * @throws IOException if writing to {@code output} fails
     * @throws IllegalArgumentException if a type kind has no protobuf equivalent
     */
    @Override
    public int writeFooter(SliceOutput output, Footer footer) throws IOException {
        OrcProto.Footer footerProtobuf = OrcProto.Footer.newBuilder()
                .setWriter(PRESTO_WRITER_ID)
                .setNumberOfRows(footer.getNumberOfRows())
                .setRowIndexStride(footer.getRowsInRowGroup())
                .addAllStripes(footer.getStripes().stream()
                        .map(OrcMetadataWriter::toStripeInformation)
                        .collect(toList()))
                .addAllTypes(footer.getTypes().stream()
                        .map(OrcMetadataWriter::toType)
                        .collect(toList()))
                .addAllStatistics(footer.getFileStats().stream()
                        .map(OrcMetadataWriter::toColumnStatistics)
                        .collect(toList()))
                .addAllMetadata(footer.getUserMetadata().entrySet().stream()
                        .map(OrcMetadataWriter::toUserMetadata)
                        .collect(toList()))
                .build();

        return writeProtobufObject(output, footerProtobuf);
    }

    /** Copies row count, offset and the index/data/footer lengths of a stripe into protobuf form. */
    private static OrcProto.StripeInformation toStripeInformation(StripeInformation stripe) {
        return OrcProto.StripeInformation.newBuilder()
                .setNumberOfRows(stripe.getNumberOfRows())
                .setOffset(stripe.getOffset())
                .setIndexLength(stripe.getIndexLength())
                .setDataLength(stripe.getDataLength())
                .setFooterLength(stripe.getFooterLength())
                .build();
    }

    /**
     * Converts a type description into its protobuf form. Length, precision and scale are only
     * set on the message when they are present on {@code type}.
     */
    private static Type toType(OrcType type) {
        Builder builder = Type.newBuilder()
                .setKind(toTypeKind(type.getOrcTypeKind()))
                .addAllSubtypes(type.getFieldTypeIndexes())
                .addAllFieldNames(type.getFieldNames());

        type.getLength().ifPresent(builder::setMaximumLength);
        type.getPrecision().ifPresent(builder::setPrecision);
        type.getScale().ifPresent(builder::setScale);

        return builder.build();
    }

    /**
     * Maps a type kind onto the protobuf enum constant of the same name.
     *
     * @throws IllegalArgumentException if the kind is not one of the handled constants
     */
    private static OrcProto.Type.Kind toTypeKind(OrcTypeKind orcTypeKind) {
        switch (orcTypeKind) {
        case BOOLEAN:
            return OrcProto.Type.Kind.BOOLEAN;
        case BYTE:
            return OrcProto.Type.Kind.BYTE;
        case SHORT:
            return OrcProto.Type.Kind.SHORT;
        case INT:
            return OrcProto.Type.Kind.INT;
        case LONG:
            return OrcProto.Type.Kind.LONG;
        case DECIMAL:
            return OrcProto.Type.Kind.DECIMAL;
        case FLOAT:
            return OrcProto.Type.Kind.FLOAT;
        case DOUBLE:
            return OrcProto.Type.Kind.DOUBLE;
        case STRING:
            return OrcProto.Type.Kind.STRING;
        case VARCHAR:
            return OrcProto.Type.Kind.VARCHAR;
        case CHAR:
            return OrcProto.Type.Kind.CHAR;
        case BINARY:
            return OrcProto.Type.Kind.BINARY;
        case DATE:
            return OrcProto.Type.Kind.DATE;
        case TIMESTAMP:
            return OrcProto.Type.Kind.TIMESTAMP;
        case LIST:
            return OrcProto.Type.Kind.LIST;
        case MAP:
            return OrcProto.Type.Kind.MAP;
        case STRUCT:
            return OrcProto.Type.Kind.STRUCT;
        case UNION:
            return OrcProto.Type.Kind.UNION;
        }
        throw new IllegalArgumentException("Unsupported type: " + orcTypeKind);
    }

    /**
     * Converts column statistics into the protobuf message. Each typed statistics section
     * (boolean, integer, double, string, date, decimal, binary) is emitted only when the
     * corresponding getter returns non-null.
     *
     * <p>Within a section, a minimum or maximum that is {@code null} is left unset in the
     * message instead of being dereferenced, matching the handling already applied to string
     * bounds and to the integer sum.
     */
    private static OrcProto.ColumnStatistics toColumnStatistics(ColumnStatistics columnStatistics) {
        OrcProto.ColumnStatistics.Builder builder = OrcProto.ColumnStatistics.newBuilder();

        if (columnStatistics.hasNumberOfValues()) {
            builder.setNumberOfValues(columnStatistics.getNumberOfValues());
        }

        if (columnStatistics.getBooleanStatistics() != null) {
            builder.setBucketStatistics(OrcProto.BucketStatistics.newBuilder()
                    .addCount(columnStatistics.getBooleanStatistics().getTrueValueCount())
                    .build());
        }

        if (columnStatistics.getIntegerStatistics() != null) {
            OrcProto.IntegerStatistics.Builder integerStatistics = OrcProto.IntegerStatistics.newBuilder();
            // Boxed locals: a null bound is skipped rather than unboxed into the setter.
            Long integerMinimum = columnStatistics.getIntegerStatistics().getMin();
            if (integerMinimum != null) {
                integerStatistics.setMinimum(integerMinimum);
            }
            Long integerMaximum = columnStatistics.getIntegerStatistics().getMax();
            if (integerMaximum != null) {
                integerStatistics.setMaximum(integerMaximum);
            }
            if (columnStatistics.getIntegerStatistics().getSum() != null) {
                integerStatistics.setSum(columnStatistics.getIntegerStatistics().getSum());
            }
            builder.setIntStatistics(integerStatistics.build());
        }

        if (columnStatistics.getDoubleStatistics() != null) {
            OrcProto.DoubleStatistics.Builder doubleStatistics = OrcProto.DoubleStatistics.newBuilder();
            Double doubleMinimum = columnStatistics.getDoubleStatistics().getMin();
            if (doubleMinimum != null) {
                doubleStatistics.setMinimum(doubleMinimum);
            }
            Double doubleMaximum = columnStatistics.getDoubleStatistics().getMax();
            if (doubleMaximum != null) {
                doubleStatistics.setMaximum(doubleMaximum);
            }
            builder.setDoubleStatistics(doubleStatistics.build());
        }

        if (columnStatistics.getStringStatistics() != null) {
            OrcProto.StringStatistics.Builder statisticsBuilder = OrcProto.StringStatistics.newBuilder();
            if (columnStatistics.getStringStatistics().getMin() != null) {
                statisticsBuilder.setMinimumBytes(
                        ByteString.copyFrom(columnStatistics.getStringStatistics().getMin().getBytes()));
            }
            if (columnStatistics.getStringStatistics().getMax() != null) {
                statisticsBuilder.setMaximumBytes(
                        ByteString.copyFrom(columnStatistics.getStringStatistics().getMax().getBytes()));
            }
            statisticsBuilder.setSum(columnStatistics.getStringStatistics().getSum());
            builder.setStringStatistics(statisticsBuilder.build());
        }

        if (columnStatistics.getDateStatistics() != null) {
            OrcProto.DateStatistics.Builder dateStatistics = OrcProto.DateStatistics.newBuilder();
            Integer dateMinimum = columnStatistics.getDateStatistics().getMin();
            if (dateMinimum != null) {
                dateStatistics.setMinimum(dateMinimum);
            }
            Integer dateMaximum = columnStatistics.getDateStatistics().getMax();
            if (dateMaximum != null) {
                dateStatistics.setMaximum(dateMaximum);
            }
            builder.setDateStatistics(dateStatistics.build());
        }

        if (columnStatistics.getDecimalStatistics() != null) {
            OrcProto.DecimalStatistics.Builder decimalStatistics = OrcProto.DecimalStatistics.newBuilder();
            // Decimal bounds are written in their toString() form.
            if (columnStatistics.getDecimalStatistics().getMin() != null) {
                decimalStatistics.setMinimum(columnStatistics.getDecimalStatistics().getMin().toString());
            }
            if (columnStatistics.getDecimalStatistics().getMax() != null) {
                decimalStatistics.setMaximum(columnStatistics.getDecimalStatistics().getMax().toString());
            }
            builder.setDecimalStatistics(decimalStatistics.build());
        }

        if (columnStatistics.getBinaryStatistics() != null) {
            builder.setBinaryStatistics(OrcProto.BinaryStatistics.newBuilder()
                    .setSum(columnStatistics.getBinaryStatistics().getSum())
                    .build());
        }

        return builder.build();
    }

    /** Converts one user metadata entry into a protobuf item, copying the value's bytes. */
    private static UserMetadataItem toUserMetadata(Entry<String, Slice> entry) {
        return OrcProto.UserMetadataItem.newBuilder()
                .setName(entry.getKey())
                .setValue(ByteString.copyFrom(entry.getValue().getBytes()))
                .build();
    }

    /**
     * Serializes a stripe footer made of the stripe's streams and column encodings.
     *
     * @param output destination of the serialized message
     * @param footer stripe footer model to serialize
     * @return number of bytes written
     * @throws IOException if writing to {@code output} fails
     * @throws IllegalArgumentException if a stream kind or encoding kind has no protobuf
     *         equivalent, or a column encoding carries additional sequence encodings
     */
    @Override
    public int writeStripeFooter(SliceOutput output, StripeFooter footer) throws IOException {
        OrcProto.StripeFooter footerProtobuf = OrcProto.StripeFooter.newBuilder()
                .addAllStreams(footer.getStreams().stream()
                        .map(OrcMetadataWriter::toStream)
                        .collect(toList()))
                .addAllColumns(footer.getColumnEncodings().stream()
                        .map(OrcMetadataWriter::toColumnEncoding)
                        .collect(toList()))
                .build();

        return writeProtobufObject(output, footerProtobuf);
    }

    /** Converts a stream descriptor (column, kind, length) into protobuf form. */
    private static OrcProto.Stream toStream(Stream stream) {
        return OrcProto.Stream.newBuilder()
                .setColumn(stream.getColumn())
                .setKind(toStreamKind(stream.getStreamKind()))
                .setLength(stream.getLength())
                .build();
    }

    /**
     * Maps a stream kind onto the protobuf enum constant of the same name.
     *
     * @throws IllegalArgumentException if the kind is not one of the handled constants
     */
    private static OrcProto.Stream.Kind toStreamKind(StreamKind streamKind) {
        switch (streamKind) {
        case PRESENT:
            return OrcProto.Stream.Kind.PRESENT;
        case DATA:
            return OrcProto.Stream.Kind.DATA;
        case LENGTH:
            return OrcProto.Stream.Kind.LENGTH;
        case DICTIONARY_DATA:
            return OrcProto.Stream.Kind.DICTIONARY_DATA;
        case DICTIONARY_COUNT:
            return OrcProto.Stream.Kind.DICTIONARY_COUNT;
        case SECONDARY:
            return OrcProto.Stream.Kind.SECONDARY;
        case ROW_INDEX:
            return OrcProto.Stream.Kind.ROW_INDEX;
        }
        throw new IllegalArgumentException("Unsupported stream kind: " + streamKind);
    }

    /**
     * Converts a column encoding (kind and dictionary size) into protobuf form.
     *
     * @throws IllegalArgumentException if additional sequence encodings are present, or the
     *         encoding kind has no protobuf equivalent
     */
    private static OrcProto.ColumnEncoding toColumnEncoding(ColumnEncoding columnEncodings) {
        // Template form defers formatting (and columnEncodings.toString()) to the failure path.
        checkArgument(!columnEncodings.getAdditionalSequenceEncodings().isPresent(),
                "Writing columns with non-zero sequence IDs is not supported in ORC: %s", columnEncodings);

        return OrcProto.ColumnEncoding.newBuilder()
                .setKind(toColumnEncoding(columnEncodings.getColumnEncodingKind()))
                .setDictionarySize(columnEncodings.getDictionarySize())
                .build();
    }

    /**
     * Maps a column encoding kind onto the protobuf enum constant of the same name.
     *
     * @throws IllegalArgumentException if the kind is not one of the handled constants
     */
    private static OrcProto.ColumnEncoding.Kind toColumnEncoding(ColumnEncodingKind columnEncodingKind) {
        switch (columnEncodingKind) {
        case DIRECT:
            return OrcProto.ColumnEncoding.Kind.DIRECT;
        case DICTIONARY:
            return OrcProto.ColumnEncoding.Kind.DICTIONARY;
        case DIRECT_V2:
            return OrcProto.ColumnEncoding.Kind.DIRECT_V2;
        case DICTIONARY_V2:
            return OrcProto.ColumnEncoding.Kind.DICTIONARY_V2;
        }
        throw new IllegalArgumentException("Unsupported column encoding kind: " + columnEncodingKind);
    }

    /**
     * Serializes a row index containing one entry per row group index.
     *
     * @param output destination of the serialized message
     * @param rowGroupIndexes row group indexes to serialize, in order
     * @return number of bytes written
     * @throws IOException if writing to {@code output} fails
     */
    @Override
    public int writeRowIndexes(SliceOutput output, List<RowGroupIndex> rowGroupIndexes) throws IOException {
        List<RowIndexEntry> entries = rowGroupIndexes.stream()
                .map(OrcMetadataWriter::toRowGroupIndex)
                .collect(toList());

        OrcProto.RowIndex rowIndexProtobuf = OrcProto.RowIndex.newBuilder()
                .addAllEntry(entries)
                .build();
        return writeProtobufObject(output, rowIndexProtobuf);
    }

    /** Converts a row group index into a protobuf entry, widening each position to a long. */
    private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex) {
        List<Long> positions = rowGroupIndex.getPositions().stream()
                .map(Integer::longValue)
                .collect(toList());

        return OrcProto.RowIndexEntry.newBuilder()
                .addAllPositions(positions)
                .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
                .build();
    }

    /**
     * Maps a compression kind onto the protobuf enum constant of the same name.
     *
     * @throws IllegalArgumentException if the kind is not one of the handled constants
     */
    private static OrcProto.CompressionKind toCompression(CompressionKind compressionKind) {
        switch (compressionKind) {
        case NONE:
            return OrcProto.CompressionKind.NONE;
        case ZLIB:
            return OrcProto.CompressionKind.ZLIB;
        case SNAPPY:
            return OrcProto.CompressionKind.SNAPPY;
        case LZ4:
            return OrcProto.CompressionKind.LZ4;
        case ZSTD:
            return OrcProto.CompressionKind.ZSTD;
        }
        throw new IllegalArgumentException("Unsupported compression kind: " + compressionKind);
    }

    /**
     * Writes a protobuf message to {@code output} and reports how many bytes it took.
     *
     * @return the byte count observed by a counting wrapper around {@code output}
     * @throws IOException if writing to {@code output} fails
     * @throws ArithmeticException if the byte count does not fit in an {@code int}
     */
    private static int writeProtobufObject(OutputStream output, MessageLite object) throws IOException {
        // The counting wrapper is intentionally not closed: the caller owns the underlying stream.
        CountingOutputStream countingOutput = new CountingOutputStream(output);
        object.writeTo(countingOutput);
        return toIntExact(countingOutput.getCount());
    }
}