com.google.cloud.dataflow.sdk.coders.StringUtf8Coder.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.dataflow.sdk.coders.StringUtf8Coder.java

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.coders;

import com.google.cloud.dataflow.sdk.util.ExposedByteArrayOutputStream;
import com.google.cloud.dataflow.sdk.util.StreamUtils;
import com.google.cloud.dataflow.sdk.util.VarInt;
import com.google.common.base.Utf8;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;

import com.fasterxml.jackson.annotation.JsonCreator;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UTFDataFormatException;
import java.nio.charset.StandardCharsets;

/**
 * A {@link Coder} for {@link String Strings} that serializes them as UTF-8 bytes.
 *
 * <p>When the string occupies the whole stream, only the raw UTF-8 bytes are written.
 * Otherwise (nested context) the bytes are preceded by their count, written as a
 * variable-length integer using {@link VarInt}.
 */
public class StringUtf8Coder extends AtomicCoder<String> {

    /** The single shared instance handed out by {@link #of()}. */
    private static final StringUtf8Coder INSTANCE = new StringUtf8Coder();

    /**
     * Returns the shared {@link StringUtf8Coder} instance.
     *
     * @return the singleton coder
     */
    @JsonCreator
    public static StringUtf8Coder of() {
        return INSTANCE;
    }

    /** Not instantiable from outside; obtain the coder through {@link #of()}. */
    private StringUtf8Coder() {
    }

    /**
     * Writes {@code text} as a variable-length byte count followed by its UTF-8 bytes.
     *
     * @param text the string to write
     * @param sink the stream receiving the length prefix and the payload
     * @throws IOException if writing to {@code sink} fails
     */
    private static void writeLengthPrefixed(String text, DataOutputStream sink) throws IOException {
        byte[] utf8 = text.getBytes(StandardCharsets.UTF_8);
        VarInt.encode(utf8.length, sink);
        sink.write(utf8);
    }

    /**
     * Reads a variable-length byte count and then exactly that many bytes, decoding them
     * as UTF-8.
     *
     * @param source the stream positioned at the length prefix
     * @return the decoded string
     * @throws CoderException if the decoded length is negative
     * @throws IOException if the stream cannot supply the length or the announced bytes
     */
    private static String readLengthPrefixed(DataInputStream source) throws IOException {
        int byteCount = VarInt.decodeInt(source);
        if (byteCount < 0) {
            throw new CoderException("Invalid encoded string length: " + byteCount);
        }
        byte[] utf8 = new byte[byteCount];
        source.readFully(utf8);
        return new String(utf8, StandardCharsets.UTF_8);
    }

    /**
     * Encodes {@code value} onto {@code outStream}.
     *
     * <p>In a whole-stream context only the UTF-8 bytes are written; in any other context
     * they are preceded by a variable-length byte count.
     *
     * @throws CoderException if {@code value} is {@code null}
     * @throws IOException if writing to {@code outStream} fails
     */
    @Override
    public void encode(String value, OutputStream outStream, Context context) throws IOException {
        if (value == null) {
            throw new CoderException("cannot encode a null String");
        }
        if (!context.isWholeStream) {
            writeLengthPrefixed(value, new DataOutputStream(outStream));
            return;
        }

        byte[] utf8 = value.getBytes(StandardCharsets.UTF_8);
        if (outStream instanceof ExposedByteArrayOutputStream) {
            // The array was created just above and is not referenced anywhere else,
            // so it can be handed over to the stream through writeAndOwn.
            ((ExposedByteArrayOutputStream) outStream).writeAndOwn(utf8);
            return;
        }
        outStream.write(utf8);
    }

    /**
     * Decodes a string from {@code inStream}.
     *
     * <p>In a whole-stream context every byte obtained from the stream is treated as UTF-8
     * payload; in any other context a variable-length byte count is read first and only
     * that many bytes are consumed.
     *
     * @throws CoderException if the length prefix is negative, or wrapping an
     *     {@link EOFException} / {@link UTFDataFormatException} raised while reading
     * @throws IOException if reading from {@code inStream} fails for another reason
     */
    @Override
    public String decode(InputStream inStream, Context context) throws IOException {
        if (context.isWholeStream) {
            return new String(StreamUtils.getBytes(inStream), StandardCharsets.UTF_8);
        }
        try {
            return readLengthPrefixed(new DataInputStream(inStream));
        } catch (EOFException | UTFDataFormatException exn) {
            // Both exception types signal malformed or truncated input, so they are
            // reported to the caller as a coder failure instead.
            throw new CoderException(exn);
        }
    }

    /**
     * {@inheritDoc}
     *
     * @return {@code true}. This coder is injective.
     */
    @Override
    public boolean consistentWithEquals() {
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * @return the byte size of the UTF-8 encoding of the string or, in a nested context,
     *     that size plus the size of the encoded length prefix.
     * @throws CoderException if {@code value} is {@code null}
     */
    @Override
    protected long getEncodedElementByteSize(String value, Context context) throws Exception {
        if (value == null) {
            throw new CoderException("cannot encode a null String");
        }
        if (context.isWholeStream) {
            return Utf8.encodedLength(value);
        }

        // Nested context: run the real length-prefixed encoding into a sink that
        // discards the data and only counts how many bytes went through it.
        CountingOutputStream byteCounter =
                new CountingOutputStream(ByteStreams.nullOutputStream());
        writeLengthPrefixed(value, new DataOutputStream(byteCounter));
        return byteCounter.getCount();
    }
}