org.apache.hawq.pxf.plugins.hive.HiveResolver.java Source code

Introduction

Here is the source code for org.apache.hawq.pxf.plugins.hive.HiveResolver.java
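HiveResolver is the PXF read resolver for Hive tables: it uses the SerDe named in the fragment's user data to deserialize each row and turns it into a list of OneField values, with the table's partition values appended at the end of the record. The helper below is only a minimal sketch of how the class is driven (the class name HiveResolverUsageSketch is made up for illustration); in a real deployment the PXF service builds the InputData from the request and the Hive accessor supplies each OneRow, neither of which is shown here.

import java.util.List;

import org.apache.hawq.pxf.api.OneField;
import org.apache.hawq.pxf.api.OneRow;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.hawq.pxf.plugins.hive.HiveResolver;

public class HiveResolverUsageSketch {
    // 'input' is assumed to already carry the fragment user data written by
    // HiveDataFragmenter; 'row' is assumed to wrap the Writable produced by
    // the matching Hive accessor for one record.
    public static List<OneField> resolve(InputData input, OneRow row) throws Exception {
        HiveResolver resolver = new HiveResolver(input);
        // One OneField per table column; partition values come last.
        return resolver.getFields(row);
    }
}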

Source

package org.apache.hawq.pxf.plugins.hive;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.hawq.pxf.api.*;
import org.apache.hawq.pxf.api.io.DataType;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.hawq.pxf.api.utilities.Plugin;
import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;
import org.apache.hawq.pxf.service.utilities.Utilities;

import org.apache.commons.lang.CharUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.*;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.*;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import static org.apache.hawq.pxf.api.io.DataType.*;

/**
 * Class HiveResolver handles deserialization of records that were serialized
 * using Hadoop's Hive serialization framework.
 */
/*
 * TODO - remove the SuppressWarnings annotation once Hive resolves the problem
 * described below. This annotation, and the change of the deserializer member
 * to Object instead of the original Deserializer, stem from the same issue: in
 * Hive 0.11.0 the API changed so that all SerDe types extend a new interface,
 * AbstractSerDe, but this change was not adopted by OrcSerde (which was also
 * introduced in Hive 0.11.0). This bit of juggling is needed to cope with that
 * inconsistency.
 */
@SuppressWarnings("deprecation")
public class HiveResolver extends Plugin implements ReadResolver {
    private static final Log LOG = LogFactory.getLog(HiveResolver.class);
    private static final String MAPKEY_DELIM = ":";
    private static final String COLLECTION_DELIM = ",";
    private SerDe deserializer;
    private List<OneField> partitionFields;
    private String serdeName;
    private String propsString;
    private String collectionDelim;
    private String mapkeyDelim;
    String partitionKeys;
    char delimiter;
    String nullChar = "\\N";
    private Configuration conf;
    private String hiveDefaultPartName;

    /**
     * Constructs the HiveResolver by parsing the userdata in the input and
     * obtaining the serde class name, the serde properties string and the
     * partition keys.
     *
     * @param input contains the Serde class name, the serde properties string
     *            and the partition keys
     * @throws Exception if user data was wrong or serde failed to be
     *             instantiated
     */
    public HiveResolver(InputData input) throws Exception {
        super(input);

        conf = new Configuration();
        hiveDefaultPartName = HiveConf.getVar(conf, HiveConf.ConfVars.DEFAULTPARTITIONNAME);
        LOG.debug("Hive's default partition name is " + hiveDefaultPartName);

        parseUserData(input);
        initPartitionFields();
        initSerde(input);
    }

    @Override
    public List<OneField> getFields(OneRow onerow) throws Exception {
        Object tuple = deserializer.deserialize((Writable) onerow.getData());
        // Each Hive record is a Struct
        StructObjectInspector soi = (StructObjectInspector) deserializer.getObjectInspector();
        List<OneField> record = traverseStruct(tuple, soi, false);
        /*
         * We follow Hive convention. Partition fields are always added at the
         * end of the record
         */
        record.addAll(partitionFields);

        return record;
    }

    /* Parses the user data string (passed along from the fragmenter). */
    void parseUserData(InputData input) throws Exception {
        final int EXPECTED_NUM_OF_TOKS = 5;

        String userData = new String(input.getFragmentUserData());
        String[] toks = userData.split(HiveDataFragmenter.HIVE_UD_DELIM);
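        // Token layout, as consumed below: toks[1] holds the SerDe class name,
        // toks[2] the serde properties string, and toks[3] the partition keys.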

        if (toks.length != EXPECTED_NUM_OF_TOKS) {
            throw new UserDataException(
                    "HiveResolver expected " + EXPECTED_NUM_OF_TOKS + " tokens, but got " + toks.length);
        }

        serdeName = toks[1];
        propsString = toks[2];
        partitionKeys = toks[3];

        collectionDelim = input.getUserProperty("COLLECTION_DELIM") == null ? COLLECTION_DELIM
                : input.getUserProperty("COLLECTION_DELIM");
        mapkeyDelim = input.getUserProperty("MAPKEY_DELIM") == null ? MAPKEY_DELIM
                : input.getUserProperty("MAPKEY_DELIM");
    }

    /*
     * Gets and initializes the deserializer for the records of this Hive data
     * fragment.
     */
    void initSerde(InputData inputData) throws Exception {
        Properties serdeProperties;

        Class<?> c = Class.forName(serdeName, true, JavaUtils.getClassLoader());
        deserializer = (SerDe) c.newInstance();
        serdeProperties = new Properties();
        ByteArrayInputStream inStream = new ByteArrayInputStream(propsString.getBytes());
        serdeProperties.load(inStream);
        deserializer.initialize(new JobConf(conf, HiveResolver.class), serdeProperties);
    }

    /*
     * The partition fields are initialized once, based on the userData
     * provided by the fragmenter.
     */
    void initPartitionFields() {
        partitionFields = new LinkedList<>();
        if (partitionKeys.equals(HiveDataFragmenter.HIVE_NO_PART_TBL)) {
            return;
        }

        String[] partitionLevels = partitionKeys.split(HiveDataFragmenter.HIVE_PARTITIONS_DELIM);
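        // Each level is split with HIVE_1_PART_DELIM; index 1 carries the Hive
        // type name and index 2 the partition value, as read below.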
        for (String partLevel : partitionLevels) {
            String[] levelKey = partLevel.split(HiveDataFragmenter.HIVE_1_PART_DELIM);
            String type = levelKey[1];
            String val = levelKey[2];
            DataType convertedType;
            Object convertedValue = null;
            boolean isDefaultPartition = false;

            LOG.debug("Partition type: " + type + ", value: " + val);
            // check if value is default partition
            isDefaultPartition = isDefaultPartition(type, val);
            // ignore the type's parameters
            String typeName = type.replaceAll("\\(.*\\)", "");

            switch (typeName) {
            case serdeConstants.STRING_TYPE_NAME:
                convertedType = TEXT;
                convertedValue = isDefaultPartition ? null : val;
                break;
            case serdeConstants.BOOLEAN_TYPE_NAME:
                convertedType = BOOLEAN;
                convertedValue = isDefaultPartition ? null : Boolean.valueOf(val);
                break;
            case serdeConstants.TINYINT_TYPE_NAME:
            case serdeConstants.SMALLINT_TYPE_NAME:
                convertedType = SMALLINT;
                convertedValue = isDefaultPartition ? null : Short.parseShort(val);
                break;
            case serdeConstants.INT_TYPE_NAME:
                convertedType = INTEGER;
                convertedValue = isDefaultPartition ? null : Integer.parseInt(val);
                break;
            case serdeConstants.BIGINT_TYPE_NAME:
                convertedType = BIGINT;
                convertedValue = isDefaultPartition ? null : Long.parseLong(val);
                break;
            case serdeConstants.FLOAT_TYPE_NAME:
                convertedType = REAL;
                convertedValue = isDefaultPartition ? null : Float.parseFloat(val);
                break;
            case serdeConstants.DOUBLE_TYPE_NAME:
                convertedType = FLOAT8;
                convertedValue = isDefaultPartition ? null : Double.parseDouble(val);
                break;
            case serdeConstants.TIMESTAMP_TYPE_NAME:
                convertedType = TIMESTAMP;
                convertedValue = isDefaultPartition ? null : Timestamp.valueOf(val);
                break;
            case serdeConstants.DATE_TYPE_NAME:
                convertedType = DATE;
                convertedValue = isDefaultPartition ? null : Date.valueOf(val);
                break;
            case serdeConstants.DECIMAL_TYPE_NAME:
                convertedType = NUMERIC;
                convertedValue = isDefaultPartition ? null : HiveDecimal.create(val).bigDecimalValue().toString();
                break;
            case serdeConstants.VARCHAR_TYPE_NAME:
                convertedType = VARCHAR;
                convertedValue = isDefaultPartition ? null : val;
                break;
            case serdeConstants.CHAR_TYPE_NAME:
                convertedType = BPCHAR;
                convertedValue = isDefaultPartition ? null : val;
                break;
            case serdeConstants.BINARY_TYPE_NAME:
                convertedType = BYTEA;
                convertedValue = isDefaultPartition ? null : val.getBytes();
                break;
            default:
                throw new UnsupportedTypeException("Unsupported partition type: " + type);
            }
            addOneFieldToRecord(partitionFields, convertedType, convertedValue);
        }
    }

    /*
     * Appends the partition values, as delimited text, to the given
     * StringBuilder, based on the userData provided by the fragmenter.
     * Returns the number of partition levels.
     */
    int initPartitionFields(StringBuilder parts) {
        if (partitionKeys.equals(HiveDataFragmenter.HIVE_NO_PART_TBL)) {
            return 0;
        }
        String[] partitionLevels = partitionKeys.split(HiveDataFragmenter.HIVE_PARTITIONS_DELIM);
        for (String partLevel : partitionLevels) {
            String[] levelKey = partLevel.split(HiveDataFragmenter.HIVE_1_PART_DELIM);
            String type = levelKey[1];
            String val = levelKey[2];
            parts.append(delimiter);

            if (isDefaultPartition(type, val)) {
                parts.append(nullChar);
            } else {
                // ignore the type's parameters
                String typeName = type.replaceAll("\\(.*\\)", "");
                switch (typeName) {
                case serdeConstants.STRING_TYPE_NAME:
                case serdeConstants.VARCHAR_TYPE_NAME:
                case serdeConstants.CHAR_TYPE_NAME:
                    parts.append(val);
                    break;
                case serdeConstants.BOOLEAN_TYPE_NAME:
                    parts.append(Boolean.parseBoolean(val));
                    break;
                case serdeConstants.TINYINT_TYPE_NAME:
                case serdeConstants.SMALLINT_TYPE_NAME:
                    parts.append(Short.parseShort(val));
                    break;
                case serdeConstants.INT_TYPE_NAME:
                    parts.append(Integer.parseInt(val));
                    break;
                case serdeConstants.BIGINT_TYPE_NAME:
                    parts.append(Long.parseLong(val));
                    break;
                case serdeConstants.FLOAT_TYPE_NAME:
                    parts.append(Float.parseFloat(val));
                    break;
                case serdeConstants.DOUBLE_TYPE_NAME:
                    parts.append(Double.parseDouble(val));
                    break;
                case serdeConstants.TIMESTAMP_TYPE_NAME:
                    parts.append(Timestamp.valueOf(val));
                    break;
                case serdeConstants.DATE_TYPE_NAME:
                    parts.append(Date.valueOf(val));
                    break;
                case serdeConstants.DECIMAL_TYPE_NAME:
                    parts.append(HiveDecimal.create(val).bigDecimalValue());
                    break;
                case serdeConstants.BINARY_TYPE_NAME:
                    Utilities.byteArrayToOctalString(val.getBytes(), parts);
                    break;
                default:
                    throw new UnsupportedTypeException("Unsupported partition type: " + type);
                }
            }
        }
        return partitionLevels.length;
    }

    /**
     * Returns true if the partition value is Hive's default partition name
     * (defined in hive.exec.default.partition.name).
     *
     * @param partitionType partition field type
     * @param partitionValue partition value
     * @return true if the partition value is Hive's default partition
     */
    private boolean isDefaultPartition(String partitionType, String partitionValue) {
        boolean isDefaultPartition = false;
        if (hiveDefaultPartName.equals(partitionValue)) {
            LOG.debug("partition " + partitionType + " is hive default partition (value " + partitionValue
                    + "), converting field to NULL");
            isDefaultPartition = true;
        }
        return isDefaultPartition;
    }

    /*
     * If the object representing the whole record is null, or if an object
     * representing a composite sub-object (map, list, etc.) is null, a
     * BadRecordException is thrown. If a primitive field value is null, a null
     * appears for that field in the record in the query result.
     */
    private void traverseTuple(Object obj, ObjectInspector objInspector, List<OneField> record, boolean toFlatten)
            throws IOException, BadRecordException {
        ObjectInspector.Category category = objInspector.getCategory();
        if ((obj == null) && (category != ObjectInspector.Category.PRIMITIVE)) {
            throw new BadRecordException("NULL Hive composite object");
        }
        switch (category) {
        case PRIMITIVE:
            resolvePrimitive(obj, (PrimitiveObjectInspector) objInspector, record, toFlatten);
            break;
        case LIST:
            List<OneField> listRecord = traverseList(obj, (ListObjectInspector) objInspector);
            addOneFieldToRecord(record, TEXT,
                    String.format("[%s]", HdfsUtilities.toString(listRecord, collectionDelim)));
            break;
        case MAP:
            List<OneField> mapRecord = traverseMap(obj, (MapObjectInspector) objInspector);
            addOneFieldToRecord(record, TEXT,
                    String.format("{%s}", HdfsUtilities.toString(mapRecord, collectionDelim)));
            break;
        case STRUCT:
            List<OneField> structRecord = traverseStruct(obj, (StructObjectInspector) objInspector, true);
            addOneFieldToRecord(record, TEXT,
                    String.format("{%s}", HdfsUtilities.toString(structRecord, collectionDelim)));
            break;
        case UNION:
            List<OneField> unionRecord = traverseUnion(obj, (UnionObjectInspector) objInspector);
            addOneFieldToRecord(record, TEXT,
                    String.format("[%s]", HdfsUtilities.toString(unionRecord, collectionDelim)));
            break;
        default:
            throw new UnsupportedTypeException("Unknown category type: " + objInspector.getCategory());
        }
    }

    private List<OneField> traverseUnion(Object obj, UnionObjectInspector uoi)
            throws BadRecordException, IOException {
        List<OneField> unionRecord = new LinkedList<>();
        List<? extends ObjectInspector> ois = uoi.getObjectInspectors();
        if (ois == null) {
            throw new BadRecordException("Illegal value NULL for Hive data type Union");
        }
        traverseTuple(uoi.getField(obj), ois.get(uoi.getTag(obj)), unionRecord, true);
        return unionRecord;
    }

    private List<OneField> traverseList(Object obj, ListObjectInspector loi)
            throws BadRecordException, IOException {
        List<OneField> listRecord = new LinkedList<>();
        List<?> list = loi.getList(obj);
        ObjectInspector eoi = loi.getListElementObjectInspector();
        if (list == null) {
            throw new BadRecordException("Illegal value NULL for Hive data type List");
        }
        for (Object object : list) {
            traverseTuple(object, eoi, listRecord, true);
        }
        return listRecord;
    }

    private List<OneField> traverseStruct(Object struct, StructObjectInspector soi, boolean toFlatten)
            throws BadRecordException, IOException {
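        // Note: toFlatten == false is used for the top-level record, where each
        // struct field becomes its own OneField; toFlatten == true is used for
        // nested structs, where each field is emitted as a TEXT OneField of the
        // form "name"<mapkeyDelim>value.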
        List<? extends StructField> fields = soi.getAllStructFieldRefs();
        List<Object> structFields = soi.getStructFieldsDataAsList(struct);
        if (structFields == null) {
            throw new BadRecordException("Illegal value NULL for Hive data type Struct");
        }
        List<OneField> structRecord = new LinkedList<>();
        List<OneField> complexRecord = new LinkedList<>();
        for (int i = 0; i < structFields.size(); i++) {
            if (toFlatten) {
                complexRecord
                        .add(new OneField(TEXT.getOID(), String.format("\"%s\"", fields.get(i).getFieldName())));
            }
            traverseTuple(structFields.get(i), fields.get(i).getFieldObjectInspector(), complexRecord, toFlatten);
            if (toFlatten) {
                addOneFieldToRecord(structRecord, TEXT, HdfsUtilities.toString(complexRecord, mapkeyDelim));
                complexRecord.clear();
            }
        }
        return toFlatten ? structRecord : complexRecord;
    }

    private List<OneField> traverseMap(Object obj, MapObjectInspector moi) throws BadRecordException, IOException {
        List<OneField> complexRecord = new LinkedList<>();
        List<OneField> mapRecord = new LinkedList<>();
        ObjectInspector koi = moi.getMapKeyObjectInspector();
        ObjectInspector voi = moi.getMapValueObjectInspector();
        Map<?, ?> map = moi.getMap(obj);
        if (map == null) {
            throw new BadRecordException("Illegal value NULL for Hive data type Map");
        } else if (map.isEmpty()) {
            traverseTuple(null, koi, complexRecord, true);
            traverseTuple(null, voi, complexRecord, true);
            addOneFieldToRecord(mapRecord, TEXT, HdfsUtilities.toString(complexRecord, mapkeyDelim));
        } else {
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                traverseTuple(entry.getKey(), koi, complexRecord, true);
                traverseTuple(entry.getValue(), voi, complexRecord, true);
                addOneFieldToRecord(mapRecord, TEXT, HdfsUtilities.toString(complexRecord, mapkeyDelim));
                complexRecord.clear();
            }
        }
        return mapRecord;
    }

    private void resolvePrimitive(Object o, PrimitiveObjectInspector oi, List<OneField> record, boolean toFlatten)
            throws IOException {
        Object val;
        switch (oi.getPrimitiveCategory()) {
        case BOOLEAN: {
            val = (o != null) ? ((BooleanObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, BOOLEAN, val);
            break;
        }
        case SHORT: {
            val = (o != null) ? ((ShortObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, SMALLINT, val);
            break;
        }
        case INT: {
            val = (o != null) ? ((IntObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, INTEGER, val);
            break;
        }
        case LONG: {
            val = (o != null) ? ((LongObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, BIGINT, val);
            break;
        }
        case FLOAT: {
            val = (o != null) ? ((FloatObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, REAL, val);
            break;
        }
        case DOUBLE: {
            val = (o != null) ? ((DoubleObjectInspector) oi).get(o) : null;
            addOneFieldToRecord(record, FLOAT8, val);
            break;
        }
        case DECIMAL: {
            String sVal = null;
            if (o != null) {
                HiveDecimal hd = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o);
                if (hd != null) {
                    BigDecimal bd = hd.bigDecimalValue();
                    sVal = bd.toString();
                }
            }
            addOneFieldToRecord(record, NUMERIC, sVal);
            break;
        }
        case STRING: {
            val = (o != null) ? ((StringObjectInspector) oi).getPrimitiveJavaObject(o) : null;
            addOneFieldToRecord(record, TEXT, toFlatten ? String.format("\"%s\"", val) : val);
            break;
        }
        case VARCHAR:
            val = (o != null) ? ((HiveVarcharObjectInspector) oi).getPrimitiveJavaObject(o) : null;
            addOneFieldToRecord(record, VARCHAR, toFlatten ? String.format("\"%s\"", val) : val);
            break;
        case CHAR:
            val = (o != null) ? ((HiveCharObjectInspector) oi).getPrimitiveJavaObject(o) : null;
            addOneFieldToRecord(record, BPCHAR, toFlatten ? String.format("\"%s\"", val) : val);
            break;
        case BINARY: {
            byte[] toEncode = null;
            if (o != null) {
                BytesWritable bw = ((BinaryObjectInspector) oi).getPrimitiveWritableObject(o);
                toEncode = new byte[bw.getLength()];
                System.arraycopy(bw.getBytes(), 0, toEncode, 0, bw.getLength());
            }
            addOneFieldToRecord(record, BYTEA, toEncode);
            break;
        }
        case TIMESTAMP: {
            val = (o != null) ? ((TimestampObjectInspector) oi).getPrimitiveJavaObject(o) : null;
            addOneFieldToRecord(record, TIMESTAMP, val);
            break;
        }
        case DATE:
            val = (o != null) ? ((DateObjectInspector) oi).getPrimitiveJavaObject(o) : null;
            addOneFieldToRecord(record, DATE, val);
            break;
        case BYTE: { /* TINYINT */
            val = (o != null) ? new Short(((ByteObjectInspector) oi).get(o)) : null;
            addOneFieldToRecord(record, SMALLINT, val);
            break;
        }
        default: {
            throw new UnsupportedTypeException(
                    oi.getTypeName() + " conversion is not supported by " + getClass().getSimpleName());
        }
        }
    }

    private void addOneFieldToRecord(List<OneField> record, DataType gpdbWritableType, Object val) {
        record.add(new OneField(gpdbWritableType.getOID(), val));
    }

    /*
     * Gets the delimiter character from the URL, verifies and stores it. It
     * must be a single ASCII character (the same restriction as HAWQ's). If a
     * hexadecimal representation was passed, converts it to the corresponding
     * character.
     */
    void parseDelimiterChar(InputData input) {

        String userDelim = input.getUserProperty("DELIMITER");

        if (userDelim == null) {
            throw new IllegalArgumentException("DELIMITER is a required option");
        }

        final int VALID_LENGTH = 1;
        final int VALID_LENGTH_HEX = 4;

        if (userDelim.startsWith("\\x")) { // hexadecimal sequence
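            // e.g. "\x09" for a tab or "\x2C" for a comma: the "\x" prefix
            // followed by exactly two hexadecimal digits.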

            if (userDelim.length() != VALID_LENGTH_HEX) {
                throw new IllegalArgumentException("Invalid hexdecimal value for delimiter (got" + userDelim + ")");
            }

            delimiter = (char) Integer.parseInt(userDelim.substring(2, VALID_LENGTH_HEX), 16);

            if (!CharUtils.isAscii(delimiter)) {
                throw new IllegalArgumentException(
                        "Invalid delimiter value. Must be a single ASCII character, or a hexadecimal sequence (got non ASCII "
                                + delimiter + ")");
            }

            return;
        }

        if (userDelim.length() != VALID_LENGTH) {
            throw new IllegalArgumentException(
                    "Invalid delimiter value. Must be a single ASCII character, or a hexadecimal sequence (got "
                            + userDelim + ")");
        }

        if (!CharUtils.isAscii(userDelim.charAt(0))) {
            throw new IllegalArgumentException(
                    "Invalid delimiter value. Must be a single ASCII character, or a hexadecimal sequence (got non ASCII "
                            + userDelim + ")");
        }

        delimiter = userDelim.charAt(0);
    }
}