org.apache.carbondata.hadoop.stream.StreamRecordReader.java — source code

Java tutorial

Introduction

Below is the complete source code for org.apache.carbondata.hadoop.stream.StreamRecordReader.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.hadoop.stream;

import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.carbondata.core.cache.Cache;
import org.apache.carbondata.core.cache.CacheProvider;
import org.apache.carbondata.core.cache.CacheType;
import org.apache.carbondata.core.cache.dictionary.Dictionary;
import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datastore.block.SegmentProperties;
import org.apache.carbondata.core.datastore.compression.CompressorFactory;
import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryGenerator;
import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryKeyGeneratorFactory;
import org.apache.carbondata.core.metadata.blocklet.index.BlockletMinMaxIndex;
import org.apache.carbondata.core.metadata.datatype.DataType;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.encoder.Encoding;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
import org.apache.carbondata.core.reader.CarbonHeaderReader;
import org.apache.carbondata.core.scan.expression.exception.FilterUnsupportedException;
import org.apache.carbondata.core.scan.filter.FilterUtil;
import org.apache.carbondata.core.scan.filter.GenericQueryType;
import org.apache.carbondata.core.scan.filter.executer.FilterExecuter;
import org.apache.carbondata.core.scan.filter.intf.RowImpl;
import org.apache.carbondata.core.scan.filter.intf.RowIntf;
import org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf;
import org.apache.carbondata.core.scan.model.QueryModel;
import org.apache.carbondata.core.util.CarbonMetadataUtil;
import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.DataTypeUtil;
import org.apache.carbondata.format.BlockletHeader;
import org.apache.carbondata.format.FileHeader;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Stream row record reader
 */
public class StreamRecordReader extends RecordReader<Void, Object> {

    // metadata
    protected CarbonTable carbonTable;
    private CarbonColumn[] storageColumns;
    private boolean[] isRequired;
    private DataType[] measureDataTypes;
    private int dimensionCount;
    private int measureCount;

    // input
    private FileSplit fileSplit;
    private Configuration hadoopConf;
    protected StreamBlockletReader input;
    protected boolean isFirstRow = true;
    protected QueryModel model;

    // decode data
    private BitSet allNonNull;
    private boolean[] isNoDictColumn;
    private DirectDictionaryGenerator[] directDictionaryGenerators;
    private CacheProvider cacheProvider;
    private Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache;
    private GenericQueryType[] queryTypes;
    private String compressorName;

    // vectorized reader
    protected boolean isFinished = false;

    // filter
    protected FilterExecuter filter;
    private boolean[] isFilterRequired;
    private Object[] filterValues;
    protected RowIntf filterRow;
    private int[] filterMap;

    // output
    protected CarbonColumn[] projection;
    private boolean[] isProjectionRequired;
    private int[] projectionMap;
    protected Object[] outputValues;

    // empty project, null filter
    protected boolean skipScanData;

    // return raw row for handoff
    private boolean useRawRow = false;

    public StreamRecordReader(QueryModel mdl, boolean useRawRow) {
        this.model = mdl;
        this.useRawRow = useRawRow;

    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
        // input
        if (split instanceof CarbonInputSplit) {
            fileSplit = (CarbonInputSplit) split;
        } else if (split instanceof CarbonMultiBlockSplit) {
            fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
        } else {
            fileSplit = (FileSplit) split;
        }

        // metadata
        hadoopConf = context.getConfiguration();
        if (model == null) {
            CarbonTableInputFormat format = new CarbonTableInputFormat<Object>();
            model = format.createQueryModel(split, context);
        }
        carbonTable = model.getTable();
        List<CarbonDimension> dimensions = carbonTable.getDimensionByTableName(carbonTable.getTableName());
        dimensionCount = dimensions.size();
        List<CarbonMeasure> measures = carbonTable.getMeasureByTableName(carbonTable.getTableName());
        measureCount = measures.size();
        List<CarbonColumn> carbonColumnList = carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName());
        storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
        isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
        directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
        for (int i = 0; i < storageColumns.length; i++) {
            if (storageColumns[i].hasEncoding(Encoding.DIRECT_DICTIONARY)) {
                directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory
                        .getDirectDictionaryGenerator(storageColumns[i].getDataType());
            }
        }
        measureDataTypes = new DataType[measureCount];
        for (int i = 0; i < measureCount; i++) {
            measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
        }

        // decode data
        allNonNull = new BitSet(storageColumns.length);
        projection = model.getProjectionColumns();

        isRequired = new boolean[storageColumns.length];
        boolean[] isFiltlerDimensions = model.getIsFilterDimensions();
        boolean[] isFiltlerMeasures = model.getIsFilterMeasures();
        isFilterRequired = new boolean[storageColumns.length];
        filterMap = new int[storageColumns.length];
        for (int i = 0; i < storageColumns.length; i++) {
            if (storageColumns[i].isDimension()) {
                if (isFiltlerDimensions[storageColumns[i].getOrdinal()]) {
                    isRequired[i] = true;
                    isFilterRequired[i] = true;
                    filterMap[i] = storageColumns[i].getOrdinal();
                }
            } else {
                if (isFiltlerMeasures[storageColumns[i].getOrdinal()]) {
                    isRequired[i] = true;
                    isFilterRequired[i] = true;
                    filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
                }
            }
        }

        isProjectionRequired = new boolean[storageColumns.length];
        projectionMap = new int[storageColumns.length];
        for (int j = 0; j < projection.length; j++) {
            for (int i = 0; i < storageColumns.length; i++) {
                if (storageColumns[i].getColName().equals(projection[j].getColName())) {
                    isRequired[i] = true;
                    isProjectionRequired[i] = true;
                    projectionMap[i] = j;
                    break;
                }
            }
        }

        // initialize filter
        if (null != model.getFilterExpressionResolverTree()) {
            initializeFilter();
        } else if (projection.length == 0) {
            skipScanData = true;
        }

    }

    private void initializeFilter() {

        List<ColumnSchema> wrapperColumnSchemaList = CarbonUtil.getColumnSchemaList(
                carbonTable.getDimensionByTableName(carbonTable.getTableName()),
                carbonTable.getMeasureByTableName(carbonTable.getTableName()));
        int[] dimLensWithComplex = new int[wrapperColumnSchemaList.size()];
        for (int i = 0; i < dimLensWithComplex.length; i++) {
            dimLensWithComplex[i] = Integer.MAX_VALUE;
        }

        int[] dictionaryColumnCardinality = CarbonUtil.getFormattedCardinality(dimLensWithComplex,
                wrapperColumnSchemaList);
        SegmentProperties segmentProperties = new SegmentProperties(wrapperColumnSchemaList,
                dictionaryColumnCardinality);
        Map<Integer, GenericQueryType> complexDimensionInfoMap = new HashMap<>();

        FilterResolverIntf resolverIntf = model.getFilterExpressionResolverTree();
        filter = FilterUtil.getFilterExecuterTree(resolverIntf, segmentProperties, complexDimensionInfoMap);
        // for row filter, we need update column index
        FilterUtil.updateIndexOfColumnExpression(resolverIntf.getFilterExpression(),
                carbonTable.getDimensionOrdinalMax());

    }

    private byte[] getSyncMarker(String filePath) throws IOException {
        CarbonHeaderReader headerReader = new CarbonHeaderReader(filePath);
        FileHeader header = headerReader.readHeader();
        // legacy store does not have this member
        if (header.isSetCompressor_name()) {
            compressorName = header.getCompressor_name();
        } else {
            compressorName = CompressorFactory.NativeSupportedCompressor.SNAPPY.getName();
        }
        return header.getSync_marker();
    }

    protected void initializeAtFirstRow() throws IOException {
        filterValues = new Object[carbonTable.getDimensionOrdinalMax() + measureCount];
        filterRow = new RowImpl();
        filterRow.setValues(filterValues);

        outputValues = new Object[projection.length];

        Path file = fileSplit.getPath();

        byte[] syncMarker = getSyncMarker(file.toString());

        FileSystem fs = file.getFileSystem(hadoopConf);

        int bufferSize = Integer.parseInt(hadoopConf.get(CarbonStreamInputFormat.READ_BUFFER_SIZE,
                CarbonStreamInputFormat.READ_BUFFER_SIZE_DEFAULT));

        FSDataInputStream fileIn = fs.open(file, bufferSize);
        fileIn.seek(fileSplit.getStart());
        input = new StreamBlockletReader(syncMarker, fileIn, fileSplit.getLength(), fileSplit.getStart() == 0,
                compressorName);

        cacheProvider = CacheProvider.getInstance();
        cache = cacheProvider.createCache(CacheType.FORWARD_DICTIONARY);
        queryTypes = CarbonStreamInputFormat.getComplexDimensions(carbonTable, storageColumns, cache);
    }

    /**
     * check next Row
     */
    protected boolean nextRow() throws IOException {
        // read row one by one
        try {
            boolean hasNext;
            boolean scanMore = false;
            do {
                hasNext = input.hasNext();
                if (hasNext) {
                    if (skipScanData) {
                        input.nextRow();
                        scanMore = false;
                    } else {
                        if (useRawRow) {
                            // read raw row for streaming handoff which does not require decode raw row
                            readRawRowFromStream();
                        } else {
                            readRowFromStream();
                        }
                        if (null != filter) {
                            scanMore = !filter.applyFilter(filterRow, carbonTable.getDimensionOrdinalMax());
                        } else {
                            scanMore = false;
                        }
                    }
                } else {
                    if (input.nextBlocklet()) {
                        BlockletHeader header = input.readBlockletHeader();
                        if (isScanRequired(header)) {
                            if (skipScanData) {
                                input.skipBlockletData(false);
                            } else {
                                input.readBlockletData(header);
                            }
                        } else {
                            input.skipBlockletData(true);
                        }
                        scanMore = true;
                    } else {
                        isFinished = true;
                        scanMore = false;
                    }
                }
            } while (scanMore);
            return hasNext;
        } catch (FilterUnsupportedException e) {
            throw new IOException("Failed to filter row in detail reader", e);
        }
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (isFirstRow) {
            isFirstRow = false;
            initializeAtFirstRow();
        }
        if (isFinished) {
            return false;
        }

        return nextRow();
    }

    @Override
    public Void getCurrentKey() throws IOException, InterruptedException {
        return null;
    }

    @Override
    public Object getCurrentValue() throws IOException, InterruptedException {
        return outputValues;
    }

    protected boolean isScanRequired(BlockletHeader header) {
        if (filter != null && header.getBlocklet_index() != null) {
            BlockletMinMaxIndex minMaxIndex = CarbonMetadataUtil
                    .convertExternalMinMaxIndex(header.getBlocklet_index().getMin_max_index());
            if (minMaxIndex != null) {
                BitSet bitSet = filter.isScanRequired(minMaxIndex.getMaxValues(), minMaxIndex.getMinValues(),
                        minMaxIndex.getIsMinMaxSet());
                if (bitSet.isEmpty()) {
                    return false;
                } else {
                    return true;
                }
            }
        }
        return true;
    }

    protected void readRowFromStream() {
        input.nextRow();
        short nullLen = input.readShort();
        BitSet nullBitSet = allNonNull;
        if (nullLen > 0) {
            nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
        }
        int colCount = 0;
        // primitive type dimension
        for (; colCount < isNoDictColumn.length; colCount++) {
            if (nullBitSet.get(colCount)) {
                if (isFilterRequired[colCount]) {
                    filterValues[filterMap[colCount]] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
                }
                if (isProjectionRequired[colCount]) {
                    outputValues[projectionMap[colCount]] = null;
                }
            } else {
                if (isNoDictColumn[colCount]) {
                    int v = input.readShort();
                    if (isRequired[colCount]) {
                        byte[] b = input.readBytes(v);
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = b;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = DataTypeUtil
                                    .getDataBasedOnDataTypeForNoDictionaryColumn(b,
                                            storageColumns[colCount].getDataType());
                        }
                    } else {
                        input.skipBytes(v);
                    }
                } else if (null != directDictionaryGenerators[colCount]) {
                    if (isRequired[colCount]) {
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = input.copy(4);
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = directDictionaryGenerators[colCount]
                                    .getValueFromSurrogate(input.readInt());
                        } else {
                            input.skipBytes(4);
                        }
                    } else {
                        input.skipBytes(4);
                    }
                } else {
                    if (isRequired[colCount]) {
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = input.copy(4);
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = input.readInt();
                        } else {
                            input.skipBytes(4);
                        }
                    } else {
                        input.skipBytes(4);
                    }
                }
            }
        }
        // complex type dimension
        for (; colCount < dimensionCount; colCount++) {
            if (nullBitSet.get(colCount)) {
                if (isFilterRequired[colCount]) {
                    filterValues[filterMap[colCount]] = null;
                }
                if (isProjectionRequired[colCount]) {
                    outputValues[projectionMap[colCount]] = null;
                }
            } else {
                short v = input.readShort();
                if (isRequired[colCount]) {
                    byte[] b = input.readBytes(v);
                    if (isFilterRequired[colCount]) {
                        filterValues[filterMap[colCount]] = b;
                    }
                    if (isProjectionRequired[colCount]) {
                        outputValues[projectionMap[colCount]] = queryTypes[colCount]
                                .getDataBasedOnDataType(ByteBuffer.wrap(b));
                    }
                } else {
                    input.skipBytes(v);
                }
            }
        }
        // measure
        DataType dataType;
        for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
            if (nullBitSet.get(colCount)) {
                if (isFilterRequired[colCount]) {
                    filterValues[filterMap[colCount]] = null;
                }
                if (isProjectionRequired[colCount]) {
                    outputValues[projectionMap[colCount]] = null;
                }
            } else {
                dataType = measureDataTypes[msrCount];
                if (dataType == DataTypes.BOOLEAN) {
                    if (isRequired[colCount]) {
                        boolean v = input.readBoolean();
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = v;
                        }
                    } else {
                        input.skipBytes(1);
                    }
                } else if (dataType == DataTypes.SHORT) {
                    if (isRequired[colCount]) {
                        short v = input.readShort();
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = v;
                        }
                    } else {
                        input.skipBytes(2);
                    }
                } else if (dataType == DataTypes.INT) {
                    if (isRequired[colCount]) {
                        int v = input.readInt();
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = v;
                        }
                    } else {
                        input.skipBytes(4);
                    }
                } else if (dataType == DataTypes.LONG) {
                    if (isRequired[colCount]) {
                        long v = input.readLong();
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = v;
                        }
                    } else {
                        input.skipBytes(8);
                    }
                } else if (dataType == DataTypes.DOUBLE) {
                    if (isRequired[colCount]) {
                        double v = input.readDouble();
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = v;
                        }
                    } else {
                        input.skipBytes(8);
                    }
                } else if (DataTypes.isDecimal(dataType)) {
                    int len = input.readShort();
                    if (isRequired[colCount]) {
                        BigDecimal v = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
                        if (isFilterRequired[colCount]) {
                            filterValues[filterMap[colCount]] = v;
                        }
                        if (isProjectionRequired[colCount]) {
                            outputValues[projectionMap[colCount]] = DataTypeUtil.getDataTypeConverter()
                                    .convertFromBigDecimalToDecimal(v);
                        }
                    } else {
                        input.skipBytes(len);
                    }
                }
            }
        }
    }

    private void readRawRowFromStream() {
        input.nextRow();
        short nullLen = input.readShort();
        BitSet nullBitSet = allNonNull;
        if (nullLen > 0) {
            nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
        }
        int colCount = 0;
        // primitive type dimension
        for (; colCount < isNoDictColumn.length; colCount++) {
            if (nullBitSet.get(colCount)) {
                outputValues[colCount] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
            } else {
                if (isNoDictColumn[colCount]) {
                    int v = input.readShort();
                    outputValues[colCount] = input.readBytes(v);
                } else {
                    outputValues[colCount] = input.readInt();
                }
            }
        }
        // complex type dimension
        for (; colCount < dimensionCount; colCount++) {
            if (nullBitSet.get(colCount)) {
                outputValues[colCount] = null;
            } else {
                short v = input.readShort();
                outputValues[colCount] = input.readBytes(v);
            }
        }
        // measure
        DataType dataType;
        for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
            if (nullBitSet.get(colCount)) {
                outputValues[colCount] = null;
            } else {
                dataType = measureDataTypes[msrCount];
                if (dataType == DataTypes.BOOLEAN) {
                    outputValues[colCount] = input.readBoolean();
                } else if (dataType == DataTypes.SHORT) {
                    outputValues[colCount] = input.readShort();
                } else if (dataType == DataTypes.INT) {
                    outputValues[colCount] = input.readInt();
                } else if (dataType == DataTypes.LONG) {
                    outputValues[colCount] = input.readLong();
                } else if (dataType == DataTypes.DOUBLE) {
                    outputValues[colCount] = input.readDouble();
                } else if (DataTypes.isDecimal(dataType)) {
                    int len = input.readShort();
                    outputValues[colCount] = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
                }
            }
        }
    }

    @Override
    public float getProgress() {
        return 0;
    }

    @Override
    public void close() throws IOException {
        if (null != input) {
            input.close();
        }
    }
}