org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector.java Source code


Introduction

Here is the source code for org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector.java, a ColumnStatCollector implementation that gathers per-column statistics from a Parquet file footer for a single row group.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.parquet.stat;

import com.google.common.base.Stopwatch;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.ParquetOutputRecordWriter;
import org.apache.drill.exec.store.parquet.ParquetGroupScan;
import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
import org.apache.drill.exec.store.parquet.columnreaders.ParquetToDrillTypeConverter;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.joda.time.DateTimeConstants;
import org.joda.time.DateTimeUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

public class ParquetFooterStatCollector implements ColumnStatCollector {
    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ParquetFooterStatCollector.class);

    private final ParquetMetadata footer;
    private final int rowGroupIndex;
    private final OptionManager options;
    private final Map<String, String> implicitColValues;
    private final boolean autoCorrectCorruptDates;

    public ParquetFooterStatCollector(ParquetMetadata footer, int rowGroupIndex,
            Map<String, String> implicitColValues, boolean autoCorrectCorruptDates, OptionManager options) {
        this.footer = footer;
        this.rowGroupIndex = rowGroupIndex;

        // Reasons to pass implicit columns and their values:
        // 1. To differentiate implicit columns from regular non-existent columns. Implicit columns do not
        //    exist in the parquet metadata; without this knowledge, an implicit column would be treated as a
        //    non-existent column. A condition on a non-existent column leads to canDrop = true, which is not
        //    the right behavior for a condition on an implicit column.

        // 2. To pass in the implicit column names with their corresponding values, wrapping them in Statistics
        //    whose min and max are the same value. This expands the opportunities for pruning.
        //    For example, with the condition regCol = 5 or dir0 = 1995: if regCol is not a partition column, the
        //    current partition pruning logic would not prune anything. Passing the implicit column values
        //    may allow us to prune some row groups using the condition regCol = 5 or dir0 = 1995.

        this.implicitColValues = implicitColValues;
        this.autoCorrectCorruptDates = autoCorrectCorruptDates;
        this.options = options;
    }

    @Override
    public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
        Stopwatch timer = Stopwatch.createStarted();

        ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility
                .detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);

        // map from column name to ColumnDescriptor
        Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();

        // map from column name to ColumnChunkMetaData
        final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();

        // map from column name to MajorType
        final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();

        // map from column name to SchemaElement
        final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();

        // map from column name to column statistics.
        final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();

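        // Convert the footer into Thrift-format file metadata so we can look up each column's
        // SchemaElement, which carries the type length and converted-type details used below.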
        final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter()
                .toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

        for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
            final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
            if (fields.contains(schemaPath)) {
                columnDescMap.put(schemaPath, column);
            }
        }

        for (final SchemaElement se : fileMetaData.getSchema()) {
            final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
            if (fields.contains(schemaPath)) {
                schemaElementMap.put(schemaPath, se);
            }
        }

        for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
            final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
            if (fields.contains(schemaPath)) {
                columnChkMetaMap.put(schemaPath, colMetaData);
            }
        }

        for (final SchemaPath path : fields) {
            if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path)
                    && columnChkMetaMap.containsKey(path)) {
                ColumnDescriptor columnDesc = columnDescMap.get(path);
                SchemaElement se = schemaElementMap.get(path);
                ColumnChunkMetaData metaData = columnChkMetaMap.get(path);

                TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(),
                        se.getType_length(), getDataMode(columnDesc), se, options);

                columnTypeMap.put(path, type);

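                // DATE values written by Drill before 1.9.0 may be corrupt (DRILL-4203); for DATE
                // columns, convert the raw int day statistics to Drill's millisecond representation,
                // shifting the values if corruption was detected.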
                Statistics stat = metaData.getStatistics();
                if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                    stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
                }

                statMap.put(path, new ColumnStatistics(stat, type));
            } else {
                final String columnName = path.getRootSegment().getPath();
                if (implicitColValues.containsKey(columnName)) {
                    TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                    Statistics stat = new BinaryStatistics();
                    stat.setNumNulls(0);
                    byte[] val = implicitColValues.get(columnName).getBytes();
                    stat.setMinMaxFromBytes(val, val);
                    statMap.put(path, new ColumnStatistics(stat, type));
                }
            }
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
        }

        return statMap;
    }

    private static TypeProtos.DataMode getDataMode(ColumnDescriptor column) {
        if (column.getMaxRepetitionLevel() > 0) {
            return TypeProtos.DataMode.REPEATED;
        } else if (column.getMaxDefinitionLevel() == 0) {
            return TypeProtos.DataMode.REQUIRED;
        } else {
            return TypeProtos.DataMode.OPTIONAL;
        }
    }

    public static Statistics convertDateStatIfNecessary(Statistics stat,
            ParquetReaderUtility.DateCorruptionStatus containsCorruptDates) {
        IntStatistics dateStat = (IntStatistics) stat;
        LongStatistics dateMLS = new LongStatistics();

        boolean isDateCorrect = containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;

        // Only do conversion when stat is NOT empty.
        if (!dateStat.isEmpty()) {
            dateMLS.setMinMax(convertToDrillDateValue(dateStat.getMin(), isDateCorrect),
                    convertToDrillDateValue(dateStat.getMax(), isDateCorrect));
            dateMLS.setNumNulls(dateStat.getNumNulls());
        }

        return dateMLS;

    }

    private static long convertToDrillDateValue(int dateValue, boolean isDateCorrect) {
        // See DRILL-4203 for the background regarding date type corruption issue in Drill CTAS prior to 1.9.0 release.
        if (isDateCorrect) {
            return dateValue * (long) DateTimeConstants.MILLIS_PER_DAY;
        } else {
            return (dateValue - ParquetReaderUtility.CORRECT_CORRUPT_DATE_SHIFT) * (long) DateTimeConstants.MILLIS_PER_DAY;
        }
    }

}
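
The following is a minimal usage sketch, not part of the file above. It assumes a Parquet file at a placeholder path, row group 0, and an OptionManager that in a real query would be supplied by Drill's fragment or query context (stubbed as null here); the column name n_nationkey, the dir0 value, and the file path are placeholders.

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.parquet.stat.ColumnStatistics;
import org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

import com.google.common.collect.ImmutableSet;

public class FooterStatExample {
    public static void main(String[] args) throws Exception {
        // Read the footer of the Parquet file whose statistics we want.
        ParquetMetadata footer = ParquetFileReader.readFooter(
                new Configuration(), new Path("/tmp/nation.parquet"));

        // Implicit columns (e.g. partition directories) and their constant values for this file.
        Map<String, String> implicitCols = new HashMap<>();
        implicitCols.put("dir0", "1995");

        // In a real query the OptionManager comes from the fragment/query context; stubbed here.
        OptionManager options = null;

        ParquetFooterStatCollector collector = new ParquetFooterStatCollector(
                footer, 0 /* rowGroupIndex */, implicitCols, true /* autoCorrectCorruptDates */, options);

        // Request statistics for a regular column and the implicit dir0 column.
        Set<SchemaPath> fields = ImmutableSet.of(
                SchemaPath.getSimplePath("n_nationkey"),
                SchemaPath.getSimplePath("dir0"));

        Map<SchemaPath, ColumnStatistics> stats = collector.collectColStat(fields);
        System.out.println(stats.keySet());
    }
}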