com.datasalt.pangool.utils.TupleToAvroRecordConverter.java Source code

Introduction

Here is the source code for com.datasalt.pangool.utils.TupleToAvroRecordConverter.java. The class converts a Pangool ITuple into an Avro GenericData.Record: primitive values are passed through as-is, while OBJECT fields are serialized into a ByteBuffer.

Source

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.utils;

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.serializer.Serializer;

import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.serialization.HadoopSerialization;
import com.datasalt.pangool.tuplemr.SerializationInfo;
import com.datasalt.pangool.tuplemr.serialization.TupleSerialization;

public class TupleToAvroRecordConverter {

    // Hadoop serialization backed by the "io.serializations" configuration
    private HadoopSerialization hadoopSer;

    //custom serializers for OBJECT fields
    @SuppressWarnings("rawtypes")
    private Serializer[] customSerializers;

    private org.apache.avro.Schema avroSchema;
    private Schema pangoolSchema;
    // scratch buffers, allocated only for BYTES-typed Avro fields
    private DataOutputBuffer[] buffers;
    // when true, toRecord() checks that each tuple's schema matches pangoolSchema
    private boolean schemaValidation;

    public TupleToAvroRecordConverter(Schema pangoolSchema, Configuration conf) {
        this.pangoolSchema = pangoolSchema;
        this.avroSchema = AvroUtils.toAvroSchema(pangoolSchema);
        try {
            this.hadoopSer = new HadoopSerialization(conf);
        } catch (IOException e) {
            throw new PangoolRuntimeException(e);
        }
        this.customSerializers = SerializationInfo.getSerializers(pangoolSchema, conf);
        initBuffers();
        schemaValidation = TupleSerialization.getSchemaValidation(conf);
    }

    private void initBuffers() {
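        // Only fields whose Avro type is BYTES need a scratch buffer; toRecord()
        // uses these buffers when serializing OBJECT fields.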
        buffers = new DataOutputBuffer[pangoolSchema.getFields().size()];
        for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
            if (field.schema().getType() == org.apache.avro.Schema.Type.BYTES) {
                buffers[field.pos()] = new DataOutputBuffer();
            }
        }
    }

    /**
     * Copies the data of a Pangool Tuple into an Avro Record. If {@code reuse}
     * is non-null, that Record instance is reused instead of allocating a new one.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public Record toRecord(ITuple tuple, Record reuse) throws IOException {
        Record record = reuse;
        if (record == null) {
            record = new Record(avroSchema);
        }
        if (schemaValidation && !tuple.getSchema().equals(pangoolSchema)) {
            throw new IOException("Tuple '" + tuple + "' contains an unexpected schema. Expected schema '"
                    + pangoolSchema + "' but found '" + tuple.getSchema() + "'");
        }
        for (int i = 0; i < pangoolSchema.getFields().size(); i++) {
            Object obj = tuple.get(i);
            Field field = pangoolSchema.getField(i);
            if (obj == null) {
                throw new IOException("Field '" + field.getName() + "' can't be null in tuple:" + tuple);
            }

            switch (field.getType()) {
            case INT:
            case LONG:
            case FLOAT:
            case BOOLEAN:
            case DOUBLE:
            case BYTES:
                record.put(i, obj); // optimistic: the value is assumed to already match the Avro type
                break;
            case OBJECT:
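                // OBJECT fields carry arbitrary Java objects: serialize them into the
                // field's scratch buffer, preferring a custom serializer when one exists.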
                Serializer customSer = customSerializers[i];
                DataOutputBuffer buffer = buffers[i];
                buffer.reset();
                if (customSer != null) {
                    customSer.open(buffer);
                    customSer.serialize(obj);
                    customSer.close(); // TODO: is closing the serializer after every field safe?
                } else {
                    hadoopSer.ser(obj, buffer);
                }
                // TODO: these ByteBuffer instances should be cached and reused
                ByteBuffer byteBuffer = ByteBuffer.wrap(buffer.getData(), 0, buffer.getLength());
                record.put(i, byteBuffer);
                break;
            case ENUM:
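                // store the enum value by its symbol name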
                record.put(i, obj.toString());
                break;
            case STRING:
                record.put(i, new Utf8(obj.toString())); // could this be a plain String instead?
                break;
            default:
                throw new IOException("No Avro type correspondence for Pangool type " + field.getType());
            }
        }
        return record;
    }

}
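
Usage

The following is a minimal sketch of how the converter might be used. It assumes Pangool's Schema, Field and Tuple classes from com.datasalt.pangool.io; the schema name and field names ("id", "name") are made up for illustration.

import java.util.Arrays;

import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;

import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.utils.TupleToAvroRecordConverter;

public class TupleToAvroRecordConverterExample {

    public static void main(String[] args) throws Exception {
        // Build a Pangool schema with an INT and a STRING field.
        Schema schema = new Schema("example", Arrays.asList(
                Field.create("id", Type.INT),
                Field.create("name", Type.STRING)));

        // Fill a tuple that conforms to the schema.
        Tuple tuple = new Tuple(schema);
        tuple.set("id", 1);
        tuple.set("name", "pangool");

        // Convert the tuple; passing null as "reuse" allocates a fresh Record.
        TupleToAvroRecordConverter converter =
                new TupleToAvroRecordConverter(schema, new Configuration());
        Record record = converter.toRecord(tuple, null);
        System.out.println(record);
    }
}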