org.apache.hadoop.hive.serde2.teradata.TeradataBinaryDataOutputStream.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.serde2.teradata.TeradataBinaryDataOutputStream.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.teradata;

import org.apache.commons.io.EndianUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;

import static java.lang.String.join;
import static java.lang.String.format;

/**
 * The TeradataBinaryDataOutputStream is used to produce the output in compliance with the Teradata binary format,
 * so the output can be directly used to load into Teradata DB using TPT fastload.
 * Since the TD binary format uses little-endian to handle the SHORT, INT, LONG, DOUBLE and etc.
 * while the Hadoop uses big-endian,
 * this stream extends ByteArrayOutputStream and uses EndianUtils to emit the byte-swapped (little-endian)
 * form of these types, and adds writers for the Teradata
 * specific types like VARCHAR, CHAR, TIMESTAMP, DATE...
 */
public class TeradataBinaryDataOutputStream extends ByteArrayOutputStream {

    private static final Log LOG = LogFactory.getLog(TeradataBinaryDataOutputStream.class);

    private static final int TIMESTAMP_NO_NANOS_BYTE_NUM = 19;

    /**
     * Largest length whose value fits in the 16 bits kept by the cast to short.
     * Anything bigger would be silently truncated in the two-byte length prefix of VARCHAR / VARBYTE.
     */
    private static final int MAX_VAR_FIELD_BYTE_NUM = 0xFFFF;

    public TeradataBinaryDataOutputStream() {
    }

    /**
     * Write VARCHAR(N).
     * The representation of Varchar in Teradata binary format is:
     * the first two bytes represent the length N of this varchar field,
     * the next N bytes represent the content of this varchar field.
     * To pad the null varchar, the length will be 0 and the content will be none.
     *
     * @param writable the varchar to write, or null to write an empty (length 0) field
     * @throws IOException if the content is longer than the two-byte length prefix can describe,
     *                     or if the underlying write fails
     */
    public void writeVarChar(HiveVarcharWritable writable) throws IOException {
        if (writable == null) {
            EndianUtils.writeSwappedShort(this, (short) 0);
            return;
        }
        Text t = writable.getTextValue();
        int varcharLength = t.getLength();
        // reject before anything is written, otherwise prefix and payload would disagree
        checkVarFieldLength(varcharLength, "HiveVarcharWritable");
        EndianUtils.writeSwappedShort(this, (short) varcharLength); // write the varchar length
        write(t.getBytes(), 0, varcharLength); // write the varchar content
    }

    /**
     * Write INT.
     * using little-endian to write integer.
     *
     * @param i the int to write
     * @throws IOException the io exception
     */
    public void writeInt(int i) throws IOException {
        EndianUtils.writeSwappedInteger(this, i);
    }

    /**
     * Write TIMESTAMP(N).
     * The representation of timestamp in Teradata binary format is:
     * the byte number to read is based on the precision of timestamp,
     * each byte represents one char and the timestamp is using string representation,
     * eg: for 1911-11-11 19:20:21.433200 in TIMESTAMP(3), we will cut it to be 1911-11-11 19:20:21.433 and write
     * 31 39  31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33.
     * A string shorter than byteNum is right-padded with '0'; when the string has exactly
     * 19 chars (no fractional part) a '.' is inserted before the zeros.
     * the null timestamp will use space to pad.
     *
     * @param timestamp the timestamp, or null to write byteNum spaces
     * @param byteNum the byte number the timestamp will write
     * @throws IOException the io exception
     */
    public void writeTimestamp(TimestampWritableV2 timestamp, int byteNum) throws IOException {
        if (timestamp == null) {
            writeSpaces(byteNum);
            return;
        }
        String sTimeStamp = timestamp.getTimestamp().toString();
        int missing = byteNum - sTimeStamp.length();
        if (missing <= 0) {
            // too precise (or exact fit): cut the string down to the requested width
            write(sTimeStamp.substring(0, byteNum).getBytes(StandardCharsets.UTF_8));
            return;
        }
        String pad;
        if (sTimeStamp.length() == TIMESTAMP_NO_NANOS_BYTE_NUM) {
            // no fractional part yet: the separator takes one of the missing positions
            pad = "." + join("", Collections.nCopies(missing - 1, "0"));
        } else {
            pad = join("", Collections.nCopies(missing, "0"));
        }
        write((sTimeStamp + pad).getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Write DOUBLE.
     * using little-endian to write double.
     *
     * @param d the double to write
     * @throws IOException the io exception
     */
    public void writeDouble(double d) throws IOException {
        EndianUtils.writeSwappedDouble(this, d);
    }

    /**
     * Write DATE.
     * The representation of date in Teradata binary format is:
     * The Date D is a int with 4 bytes using little endian.
     * The representation is (YYYYMMDD - 19000000).toInt -> D
     * eg. 1911.11.11 -> 19111111 -> 111111 -> 07 b2 01 00 in little endian.
     * the null date will use 0 to pad.
     *
     * @param date the date, or null to write 0
     * @throws IOException the io exception
     */
    public void writeDate(DateWritableV2 date) throws IOException {
        if (date == null) {
            EndianUtils.writeSwappedInteger(this, 0);
            return;
        }
        int yyyymmdd = date.get().getYear() * 10000 + date.get().getMonth() * 100 + date.get().getDay();
        EndianUtils.writeSwappedInteger(this, yyyymmdd - 19000000);
    }

    /**
     * Write LONG.
     * using little-endian to write long.
     *
     * @param l the long to write
     * @throws IOException the io exception
     */
    public void writeLong(long l) throws IOException {
        EndianUtils.writeSwappedLong(this, l);
    }

    /**
     * Write CHAR(N).
     * The representation of char in Teradata binary format is:
     * the byte number to read is based on the [charLength] * [bytePerChar] <- totalLength,
     * bytePerChar is decided by the charset: LATIN charset is 2 bytes per char and UNICODE charset is 3 bytes per char.
     * the null char will use space to pad.
     * The stripped value is written first and the remaining bytes up to length are filled with spaces.
     *
     * @param writable the char to write, or null to write length spaces
     * @param length the total byte number of the field
     * @throws IOException if the stripped content needs more than length bytes (nothing is written in that case),
     *                     or if the underlying write fails
     */
    public void writeChar(HiveCharWritable writable, int length) throws IOException {
        if (writable == null) {
            writeSpaces(length);
            return;
        }
        Text t = writable.getStrippedValue();
        int contentLength = t.getLength();
        // validate before writing so a failing call does not leave a partial field in the stream
        if (contentLength > length) {
            throw new IOException(format(
                    "The byte num %s of HiveCharWritable is more than the byte num %s we can hold. "
                            + "The content of HiveCharWritable is %s",
                    contentLength, length, writable.getPaddedValue()));
        }
        write(t.getBytes(), 0, contentLength);
        writeSpaces(length - contentLength);
    }

    /**
     * Write DECIMAL(P, S).
     * The representation of decimal in Teradata binary format is:
     * the byte number to read is decided solely by the precision(P),
     * HiveDecimal is constructed through the byte array and scale.
     * The unscaled value is written as little-endian two's complement,
     * the rest of byte will use 0x00 to pad (positive) and use 0xFF to pad (negative).
     * the null DECIMAL will use 0x00 to pad.
     *
     * @param writable the decimal to write, or null to write byteNum zero bytes
     * @param byteNum the byte num of the field
     * @param scale the scale the unscaled value must be expressed in; a smaller HiveDecimal scale is
     *              raised to it by multiplying with a power of ten
     * @throws IOException if the unscaled value needs more than byteNum bytes (nothing is written in that case),
     *                     or if the underlying write fails
     */
    public void writeDecimal(HiveDecimalWritable writable, int byteNum, int scale) throws IOException {
        if (writable == null) {
            write(new byte[byteNum]);
            return;
        }
        // since the HiveDecimal will auto adjust the scale to save resource
        // we need to adjust it back otherwise the output bytes will be wrong
        int hiveScale = writable.getHiveDecimal().scale();
        BigInteger unscaled = writable.getHiveDecimal().unscaledValue();
        if (hiveScale < scale) {
            unscaled = unscaled.multiply(BigInteger.TEN.pow(scale - hiveScale));
        }
        byte[] content = unscaled.toByteArray(); // minimal big-endian two's complement
        if (content.length > byteNum) {
            // writing it anyway would overrun the fixed-width field and shift all following fields
            throw new IOException(format(
                    "The byte num %s of HiveDecimalWritable is more than the byte num %s we can hold. "
                            + "The content of HiveDecimalWritable is %s",
                    content.length, byteNum, writable));
        }
        byte[] field = new byte[byteNum];
        if (unscaled.signum() < 0) {
            Arrays.fill(field, (byte) 0xFF); // sign-extend negative values
        }
        ArrayUtils.reverse(content); // big-endian -> little-endian
        System.arraycopy(content, 0, field, 0, content.length);
        write(field);
    }

    /**
     * Write SHORT.
     * using little-endian to write short.
     *
     * @param s the short to write
     * @throws IOException the io exception
     */
    public void writeShort(short s) throws IOException {
        EndianUtils.writeSwappedShort(this, s);
    }

    /**
     * Write VARBYTE(N).
     * The representation of VARBYTE in Teradata binary format is:
     * the first two bytes represent the length N of this varbyte field,
     * the next N bytes represent the content of this varbyte field.
     * To pad the null varbyte, the length will be 0 and the content will be none.
     *
     * @param writable the bytes to write, or null to write an empty (length 0) field
     * @throws IOException if the content is longer than the two-byte length prefix can describe,
     *                     or if the underlying write fails
     */
    public void writeVarByte(BytesWritable writable) throws IOException {
        if (writable == null) {
            EndianUtils.writeSwappedShort(this, (short) 0);
            return;
        }
        int varbyteLength = writable.getLength();
        // reject before anything is written, otherwise prefix and payload would disagree
        checkVarFieldLength(varbyteLength, "BytesWritable");
        EndianUtils.writeSwappedShort(this, (short) varbyteLength); // write the varbyte length
        write(writable.getBytes(), 0, varbyteLength); // write the varbyte content
    }

    /** Fails when a variable-length field is too long for its two-byte length prefix. */
    private static void checkVarFieldLength(int byteNum, String typeName) throws IOException {
        if (byteNum > MAX_VAR_FIELD_BYTE_NUM) {
            throw new IOException(format(
                    "The byte num %s of %s is more than the byte num %s the two-byte length prefix can describe.",
                    byteNum, typeName, MAX_VAR_FIELD_BYTE_NUM));
        }
    }

    /** Writes count space (0x20) bytes; a count of zero writes nothing. */
    private void writeSpaces(int count) throws IOException {
        if (count < 0) {
            throw new IllegalArgumentException("The byte num to pad must not be negative: " + count);
        }
        byte[] pad = new byte[count];
        Arrays.fill(pad, (byte) ' ');
        write(pad);
    }
}