Java tutorial: Apache Kylin's FactDistinctColumnsMapper, the MapReduce step that emits the distinct values of each dictionary column and samples rows to estimate per-cuboid cardinality.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.engine.mr.steps;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.kylin.common.KylinVersion;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.measure.hllc.RegisterType;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

/**
 * Mapper of the "fact distinct columns" step: emits the distinct values of every
 * dictionary column of the flat table, and optionally samples rows to estimate
 * per-cuboid cardinality with HyperLogLog counters.
 */
public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperBase<KEYIN, Object> {

    private static final Logger logger = LoggerFactory.getLogger(FactDistinctColumnsMapper.class);

    public static enum RawDataCounter {
        BYTES
    }

    protected boolean collectStatistics = false;
    protected CuboidScheduler cuboidScheduler = null;
    protected int nRowKey;
    private Integer[][] allCuboidsBitSet = null;
    private HLLCounter[] allCuboidsHLL = null;
    private Long[] cuboidIds;
    private HashFunction hf = null;
    private int rowCount = 0;
    private int samplingPercentage;
    private long[] rowHashCodesLong = null;
    private ByteArray[] row_hashcodes = null;
    private ByteBuffer tmpbuf;
    private static final Text EMPTY_TEXT = new Text();
    public static final byte MARK_FOR_PARTITION_COL = (byte) 0xFE;
    public static final byte MARK_FOR_HLL = (byte) 0xFF;

    private int partitionColumnIndex = -1;
    private boolean needFetchPartitionCol = true;

    private SelfDefineSortableKey sortableKey = new SelfDefineSortableKey();

    // about details of the new algorithm, please see KYLIN-2518
    private boolean isUsePutRowKeyToHllNewAlgorithm;

    @Override
    protected void setup(Context context) throws IOException {
        super.setup(context);
        tmpbuf = ByteBuffer.allocate(4096);
        collectStatistics = Boolean
                .parseBoolean(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_ENABLED));
        if (collectStatistics) {
            samplingPercentage = Integer
                    .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
            cuboidScheduler = new CuboidScheduler(cubeDesc);
            nRowKey = cubeDesc.getRowkey().getRowKeyColumns().length;

            List<Long> cuboidIdList = Lists.newArrayList();
            List<Integer[]> allCuboidsBitSetList = Lists.newArrayList();
            addCuboidBitSet(baseCuboidId, allCuboidsBitSetList, cuboidIdList);

            allCuboidsBitSet = allCuboidsBitSetList.toArray(new Integer[cuboidIdList.size()][]);
            cuboidIds = cuboidIdList.toArray(new Long[cuboidIdList.size()]);

            allCuboidsHLL = new HLLCounter[cuboidIds.length];
            for (int i = 0; i < cuboidIds.length; i++) {
                allCuboidsHLL[i] = new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision(), RegisterType.DENSE);
            }

            TblColRef partitionColRef = cubeDesc.getModel().getPartitionDesc().getPartitionDateColumnRef();
            if (partitionColRef != null) {
                partitionColumnIndex = intermediateTableDesc.getColumnIndex(partitionColRef);
            }

            // check whether we need to fetch the partition column values
            if (partitionColumnIndex < 0) {
                // the partition column is not on the cube, no need to fetch it
                needFetchPartitionCol = false;
            } else {
                needFetchPartitionCol = true;
            }

            // for KYLIN-2518 backward compatibility
            if (KylinVersion.isBefore200(cubeDesc.getVersion())) {
                isUsePutRowKeyToHllNewAlgorithm = false;
                row_hashcodes = new ByteArray[nRowKey];
                for (int i = 0; i < nRowKey; i++) {
                    row_hashcodes[i] = new ByteArray();
                }
                hf = Hashing.murmur3_32();
                logger.info("Found KylinVersion : {}. Use old algorithm for cuboid sampling.", cubeDesc.getVersion());
            } else {
                isUsePutRowKeyToHllNewAlgorithm = true;
                rowHashCodesLong = new long[nRowKey];
                hf = Hashing.murmur3_128();
                logger.info(
                        "Found KylinVersion : {}. Use new algorithm for cuboid sampling. About the details of the new algorithm, please refer to KYLIN-2518",
                        cubeDesc.getVersion());
            }
        }
    }

    // Recursively enumerate all cuboids and record, for each one, the ordinals of the
    // row key columns it contains (the highest bit of the base cuboid is column 0).
    private void addCuboidBitSet(long cuboidId, List<Integer[]> allCuboidsBitSet, List<Long> allCuboids) {
        allCuboids.add(cuboidId);
        Integer[] indice = new Integer[Long.bitCount(cuboidId)];

        long mask = Long.highestOneBit(baseCuboidId);
        int position = 0;
        for (int i = 0; i < nRowKey; i++) {
            if ((mask & cuboidId) > 0) {
                indice[position] = i;
                position++;
            }
            mask = mask >> 1;
        }

        allCuboidsBitSet.add(indice);
        Collection<Long> children = cuboidScheduler.getSpanningCuboid(cuboidId);
        for (Long childId : children) {
            addCuboidBitSet(childId, allCuboidsBitSet, allCuboids);
        }
    }

    @Override
    public void doMap(KEYIN key, Object record, Context context) throws IOException, InterruptedException {
        Collection<String[]> rowCollection = flatTableInputFormat.parseMapperInput(record);

        for (String[] row : rowCollection) {
            context.getCounter(RawDataCounter.BYTES).increment(countSizeInBytes(row));
            for (int i = 0; i < factDictCols.size(); i++) {
                String fieldValue = row[dictionaryColumnIndex[i]];
                if (fieldValue == null)
                    continue;

                int reducerIndex;
                if (uhcIndex[i] == 0) {
                    // for a normal dictionary column
                    reducerIndex = columnIndexToReducerBeginId.get(i);
                } else {
                    // for an ultra-high-cardinality (UHC) column, spread values over several reducers
                    reducerIndex = columnIndexToReducerBeginId.get(i)
                            + (fieldValue.hashCode() & 0x7fffffff) % uhcReducerCount;
                }

                tmpbuf.clear();
                byte[] valueBytes = Bytes.toBytes(fieldValue);
                int size = valueBytes.length + 1;
                if (size >= tmpbuf.capacity()) {
                    tmpbuf = ByteBuffer.allocate(countNewSize(tmpbuf.capacity(), size));
                }
                tmpbuf.put(Bytes.toBytes(reducerIndex)[3]);
                tmpbuf.put(valueBytes);
                outputKey.set(tmpbuf.array(), 0, tmpbuf.position());
                DataType type = factDictCols.get(i).getType();
                sortableKey.init(outputKey, type); // pass the column type so the key sorts according to its data type
                context.write(sortableKey, EMPTY_TEXT);

                // log a few rows for troubleshooting
                if (rowCount < 10) {
                    logger.info("Sample output: " + factDictCols.get(i) + " '" + fieldValue + "' => reducer "
                            + reducerIndex);
                }
            }

            if (collectStatistics) {
                if (rowCount % 100 < samplingPercentage) {
                    if (isUsePutRowKeyToHllNewAlgorithm) {
                        putRowKeyToHLLNew(row);
                    } else {
                        putRowKeyToHLLOld(row);
                    }
                }

                if (needFetchPartitionCol) {
                    String fieldValue = row[partitionColumnIndex];
                    if (fieldValue != null) {
                        tmpbuf.clear();
                        byte[] valueBytes = Bytes.toBytes(fieldValue);
                        int size = valueBytes.length + 1;
                        if (size >= tmpbuf.capacity()) {
                            tmpbuf = ByteBuffer.allocate(countNewSize(tmpbuf.capacity(), size));
                        }
                        tmpbuf.put(MARK_FOR_PARTITION_COL);
                        tmpbuf.put(valueBytes);
                        outputKey.set(tmpbuf.array(), 0, tmpbuf.position());
                        sortableKey.init(outputKey, (byte) 0);
                        context.write(sortableKey, EMPTY_TEXT);
                    }
                }
            }
            rowCount++;
        }
    }

    // Rough size of a row in bytes: UTF-8 length of each field plus one delimiter byte each.
    private long countSizeInBytes(String[] row) {
        int size = 0;
        for (String s : row) {
            size += s == null ? 1 : StringUtil.utf8Length(s);
            size++; // delimiter
        }
        return size;
    }

    private void putRowKeyToHLLOld(String[] row) {
        // generate a hash for each row key column
        for (int i = 0; i < nRowKey; i++) {
            Hasher hc = hf.newHasher();
            String colValue = row[intermediateTableDesc.getRowKeyColumnIndexes()[i]];
            if (colValue != null) {
                row_hashcodes[i].set(hc.putString(colValue).hash().asBytes());
            } else {
                row_hashcodes[i].set(hc.putInt(0).hash().asBytes());
            }
        }

        // use the row key column hashes to get a consolidated hash for each cuboid
        for (int i = 0, n = allCuboidsBitSet.length; i < n; i++) {
            Hasher hc = hf.newHasher();
            for (int position = 0; position < allCuboidsBitSet[i].length; position++) {
                hc.putBytes(row_hashcodes[allCuboidsBitSet[i][position]].array());
            }
            allCuboidsHLL[i].add(hc.hash().asBytes());
        }
    }

    private void putRowKeyToHLLNew(String[] row) {
        // generate a hash for each row key column
        for (int i = 0; i < nRowKey; i++) {
            Hasher hc = hf.newHasher();
            String colValue = row[intermediateTableDesc.getRowKeyColumnIndexes()[i]];
            if (colValue == null)
                colValue = "0";
            byte[] bytes = hc.putString(colValue).hash().asBytes();
            // add the column ordinal to the hash value to distinguish between (a,b) and (b,a)
            rowHashCodesLong[i] = (Bytes.toLong(bytes) + i);
        }

        // use the row key column hashes to get a consolidated hash for each cuboid
        for (int i = 0, n = allCuboidsBitSet.length; i < n; i++) {
            long value = 0;
            for (int position = 0; position < allCuboidsBitSet[i].length; position++) {
                value += rowHashCodesLong[allCuboidsBitSet[i][position]];
            }
            allCuboidsHLL[i].addHashDirectly(value);
        }
    }

    @Override
    protected void doCleanup(Context context) throws IOException, InterruptedException {
        if (collectStatistics) {
            ByteBuffer hllBuf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
            // output each cuboid's HLL to the reducer; the key is MARK_FOR_HLL followed by the cuboid id
            HLLCounter hll;
            for (int i = 0; i < cuboidIds.length; i++) {
                hll = allCuboidsHLL[i];

                tmpbuf.clear();
                tmpbuf.put(MARK_FOR_HLL); // one byte
                tmpbuf.putLong(cuboidIds[i]);
                outputKey.set(tmpbuf.array(), 0, tmpbuf.position());
                hllBuf.clear();
                hll.writeRegisters(hllBuf);
                outputValue.set(hllBuf.array(), 0, hllBuf.position());
                sortableKey.init(outputKey, (byte) 0);
                context.write(sortableKey, outputValue);
            }
        }
    }

    // Grow the buffer by doubling until it can hold dataSize bytes.
    private int countNewSize(int oldSize, int dataSize) {
        int newSize = oldSize * 2;
        while (newSize < dataSize) {
            newSize = newSize * 2;
        }
        return newSize;
    }
}
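A note on addCuboidBitSet: the cuboid id is treated as a bitmask in which the highest bit of the base cuboid corresponds to row key column 0, and every set bit marks a column present in that cuboid. The following self-contained sketch reproduces just that bit-to-ordinal mapping; the class and method names are illustrative, not part of Kylin.

// Sketch of the bitmask-to-column-ordinal mapping used by addCuboidBitSet. Illustrative only.
public class CuboidBits {

    // Given the base cuboid (all nRowKey bits set) and a cuboid id, return the ordinals of
    // the row key columns that cuboid contains; the highest bit maps to column 0.
    static int[] columnOrdinals(long baseCuboidId, long cuboidId, int nRowKey) {
        int[] ordinals = new int[Long.bitCount(cuboidId)];
        long mask = Long.highestOneBit(baseCuboidId);
        int position = 0;
        for (int i = 0; i < nRowKey; i++) {
            if ((mask & cuboidId) > 0) {
                ordinals[position++] = i;
            }
            mask >>= 1;
        }
        return ordinals;
    }

    public static void main(String[] args) {
        // 3 row key columns: base cuboid = 0b111; cuboid 0b101 contains columns 0 and 2.
        System.out.println(java.util.Arrays.toString(columnOrdinals(0b111, 0b101, 3))); // [0, 2]
    }
}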
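doMap also routes every dictionary value to a reducer: a normal column always goes to its single assigned reducer, while an ultra-high-cardinality (UHC) column is spread across uhcReducerCount reducers by hashing the value and masking off the sign bit. Below is a minimal sketch of the same arithmetic; the class, method, and parameter names are placeholders rather than Kylin's fields.

// Illustrative sketch of the reducer routing in doMap. Not Kylin code: uhcShards plays the
// role of uhcReducerCount, and baseReducer plays the role of columnIndexToReducerBeginId.get(i).
public class ReducerRouting {

    // For a UHC column, spread values over [baseReducer, baseReducer + uhcShards).
    // (hash & 0x7fffffff) clears the sign bit so the modulo result is never negative.
    static int uhcReducer(int baseReducer, String fieldValue, int uhcShards) {
        return baseReducer + (fieldValue.hashCode() & 0x7fffffff) % uhcShards;
    }

    // A normal dictionary column always maps to its single assigned reducer.
    static int normalReducer(int baseReducer) {
        return baseReducer;
    }

    public static void main(String[] args) {
        System.out.println(normalReducer(2));               // always reducer 2
        System.out.println(uhcReducer(5, "user_12345", 4)); // one of reducers 5..8
        System.out.println(uhcReducer(5, "user_99999", 4));
    }
}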
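Finally, the sampling path (putRowKeyToHLLNew, introduced by KYLIN-2518) hashes each row key column once per sampled row, adds the column ordinal so that (a, b) and (b, a) produce different values, and then, for every cuboid, sums the hashes of just the columns that cuboid contains before feeding the resulting long into that cuboid's HLLCounter. The sketch below reproduces the combination logic in isolation; it is a simplified illustration rather than Kylin code, with a HashSet standing in for the HLL estimator and String.hashCode() standing in for Murmur3.

// Simplified stand-alone illustration of per-cuboid cardinality sampling (KYLIN-2518 style).
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CuboidSamplingSketch {

    public static void main(String[] args) {
        // Three row key columns; two cuboids described by the ordinals of the columns they contain.
        int[][] cuboidColumnOrdinals = { { 0, 1, 2 }, { 0, 2 } };

        // One HashSet per cuboid stands in for Kylin's HLLCounter.
        List<Set<Long>> estimators = new ArrayList<>();
        for (int i = 0; i < cuboidColumnOrdinals.length; i++) {
            estimators.add(new HashSet<>());
        }

        String[][] sampledRows = {
                { "2023-01-01", "US", "phone" },
                { "2023-01-01", "US", "laptop" },
                { "2023-01-02", "CN", "phone" },
        };

        for (String[] row : sampledRows) {
            // Hash each row key column once; add the ordinal so (a,b) and (b,a) differ.
            long[] columnHashes = new long[row.length];
            for (int i = 0; i < row.length; i++) {
                String value = row[i] == null ? "0" : row[i];
                columnHashes[i] = value.hashCode() + i;
            }
            // Combine the per-column hashes of each cuboid into one value for this row.
            for (int c = 0; c < cuboidColumnOrdinals.length; c++) {
                long combined = 0;
                for (int ordinal : cuboidColumnOrdinals[c]) {
                    combined += columnHashes[ordinal];
                }
                // Kylin feeds this long into HLLCounter.addHashDirectly(combined).
                estimators.get(c).add(combined);
            }
        }

        for (int c = 0; c < estimators.size(); c++) {
            System.out.println("cuboid " + c + " distinct estimate: " + estimators.get(c).size());
        }
    }
}

Summing precomputed per-column hashes keeps the per-row cost linear in the number of row key columns instead of re-hashing every column for every cuboid, which is the main gain over the pre-2.0 algorithm in putRowKeyToHLLOld.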