com.ning.metrics.serialization.hadoop.pig.SmileStorage.java Source code

Introduction

Here is the source code for com.ning.metrics.serialization.hadoop.pig.SmileStorage.java, a Pig LoadFunc (with LoadMetadata support) that reads Smile-encoded SmileEnvelopeEvent records through SmileInputFormat and uses a schema fetched from Goodwill to map event fields to Pig tuples.

Source

/*
 * Copyright 2010-2011 Ning, Inc.
 *
 * Ning licenses this file to you under the Apache License, version 2.0
 * (the "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at:
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.ning.metrics.serialization.hadoop.pig;

import com.ning.metrics.goodwill.access.GoodwillAccessor;
import com.ning.metrics.goodwill.access.GoodwillSchema;
import com.ning.metrics.goodwill.access.GoodwillSchemaField;
import com.ning.metrics.serialization.event.SmileEnvelopeEvent;
import com.ning.metrics.serialization.hadoop.SmileInputFormat;
import com.ning.metrics.serialization.schema.SchemaFieldType;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;

public class SmileStorage extends LoadFunc implements LoadMetadata {
    private static final Logger log = LoggerFactory.getLogger(SmileStorage.class);

    private final TupleFactory factory = TupleFactory.getInstance();
    private final GoodwillSchema schema;

    private RecordReader reader;
    private PigSplit split;

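    /**
     * Creates a loader that contacts Goodwill at the host and port given by the
     * goodwill.host and goodwill.port system properties (default 127.0.0.1:8080).
     */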
    public SmileStorage(final String schemaName) throws IOException {
        this(schemaName, System.getProperty("goodwill.host", "127.0.0.1"),
                System.getProperty("goodwill.port", "8080"));
    }

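    /**
     * Creates a loader by fetching the named schema from the given Goodwill instance.
     * The lookup blocks until Goodwill responds, since getSchema returns a Future.
     */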
    public SmileStorage(final String schemaName, final String goodwillHost, final String goodwillPort)
            throws IOException {
        GoodwillAccessor goodwillAccessor = null;
        try {
            goodwillAccessor = new GoodwillAccessor(goodwillHost, Integer.parseInt(goodwillPort));
            schema = goodwillAccessor.getSchema(schemaName).get();

            if (schema == null) {
                throw new IOException(String.format("Unable to find schema %s in Goodwill (%s:%S)", schemaName,
                        goodwillHost, goodwillPort));
            }
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while trying to fetch Smile schema", e);
        } catch (ExecutionException e) {
            throw new IOException("Exception while trying to fetch Smile schema", e);
        } finally {
            if (goodwillAccessor != null) {
                goodwillAccessor.close();
            }
        }
    }

    /**
     * Communicate to the loader the location of the object(s) being loaded.
     * The location string passed to the LoadFunc here is the return value of
     * {@link org.apache.pig.LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}. Implementations
     * should use this method to communicate the location (and any other information)
     * to its underlying InputFormat through the Job object.
     * <p>
     * This method will be called in the backend multiple times. Implementations
     * should bear in mind that this method is called multiple times and should
     * ensure there are no inconsistent side effects due to the multiple calls.
     *
     * @param location Location as returned by
     *                 {@link org.apache.pig.LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job      the {@link org.apache.hadoop.mapreduce.Job} object - used to
     *                 store or retrieve earlier stored information from the {@link org.apache.pig.impl.util.UDFContext}
     * @throws java.io.IOException if the location is not valid.
     */
    @Override
    public void setLocation(final String location, final Job job) throws IOException {
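        // Record the input path(s) in the job configuration for the InputFormat to pick up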
        FileInputFormat.setInputPaths(job, location);
    }

    /**
     * This will be called during planning on the front end. This is the
     * instance of InputFormat (rather than the class name) because the
     * load function may need to instantiate the InputFormat in order
     * to control how it is constructed.
     *
     * @return the InputFormat associated with this loader.
     * @throws java.io.IOException if there is an exception during InputFormat
     *                             construction
     */
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new SmileInputFormat();
    }

    /**
     * Initializes LoadFunc for reading data.  This will be called during execution
     * before any calls to getNext.  The RecordReader needs to be passed here because
     * it has been instantiated for a particular InputSplit.
     *
     * @param reader {@link org.apache.hadoop.mapreduce.RecordReader} to be used by this instance of the LoadFunc
     * @param split  The input {@link org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit} to process
     * @throws java.io.IOException if there is an exception during initialization
     */
    @Override
    public void prepareToRead(final RecordReader reader, final PigSplit split) throws IOException {
        this.reader = reader;
        this.split = split;
    }

    /**
     * Retrieves the next tuple to be processed. Implementations should NOT reuse
     * tuple objects (or inner member objects) they return across calls and
     * should return a different tuple object in each call.
     *
     * @return the next tuple to be processed or null if there are no more tuples
     *         to be processed.
     * @throws java.io.IOException if there is an exception while retrieving the next
     *                             tuple
     */
    @Override
    public Tuple getNext() throws IOException {
        try {
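            // nextKeyValue() advances to the next event; false means the split is exhausted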
            if (reader == null || !reader.nextKeyValue()) {
                return null;
            }

            final Object value = reader.getCurrentValue();

            if (value instanceof SmileEnvelopeEvent) {
                final SmileEnvelopeEvent envelope = (SmileEnvelopeEvent) value;
                final JsonNode data = (JsonNode) envelope.getData();

                // Size the tuple to the schema, not the payload: we set one slot per schema field
                final Tuple tuple = factory.newTuple(schema.getSchema().size());
                int i = 0;
                for (final GoodwillSchemaField field : schema.getSchema()) {
                    final JsonNode node = data.get(field.getName());
                    tuple.set(i, getJsonValue(field.getType(), node));
                    i++;
                }

                return tuple;
            } else {
                throw new IOException(String.format("Expected SmileEnvelopeEvent, not %s", value.getClass()));
            }
        } catch (NullPointerException | JsonParseException e) {
            // A missing field or an undecodable payload ends up here: log it and skip
            // the rest of this split rather than failing the whole job
            String splitInfo = "<no split info>";
            if (split != null) {
                splitInfo = split.toString();
            }
            log.error(String.format("Corrupt Smile file (%s), ignoring the rest of the input", splitInfo), e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }

        return null;
    }

    private Object getJsonValue(final SchemaFieldType type, final JsonNode node) {
        switch (type) {
        case BOOLEAN:
            // Booleans are represented as 0/1 integers (see getPigType); intValue()
            // on a BooleanNode would always return 0
            return node.booleanValue() ? 1 : 0;
        case BYTE:
            return Byte.valueOf(node.textValue());
        case SHORT:
        case INTEGER:
            return node.intValue();
        case LONG:
        case DATE:
            return node.longValue();
        case DOUBLE:
            return node.doubleValue();
        case IP:
        case STRING:
        default:
            return node.textValue();
        }
    }

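    /**
     * Maps Goodwill field types to Pig types. This must stay in sync with the value
     * conversions performed in getJsonValue above.
     */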
    private byte getPigType(final SchemaFieldType type) {
        switch (type) {
        case BOOLEAN:
            return DataType.INTEGER;
        case BYTE:
            return DataType.BYTE;
        case SHORT:
        case INTEGER:
            return DataType.INTEGER;
        case LONG:
        case DATE:
            return DataType.LONG;
        case DOUBLE:
            return DataType.DOUBLE;
        case IP:
        case STRING:
        default:
            return DataType.CHARARRAY;
        }
    }

    /**
     * Get a schema for the data to be loaded.
     *
     * @param location Location as returned by
     *                 {@link org.apache.pig.LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job      The {@link org.apache.hadoop.mapreduce.Job} object - this should be used only to obtain
     *                 cluster properties through {@link org.apache.hadoop.mapreduce.Job#getConfiguration()} and not to set/query
     *                 any runtime job information.
     * @return schema for the data to be loaded. This schema should represent
     *         all tuples of the returned data.  If the schema is unknown or it is
     *         not possible to return a schema that represents all returned data,
     *         then null should be returned. The schema should not be affected by pushProjection, i.e.
     *         getSchema should always return the original schema even after pushProjection.
     * @throws java.io.IOException if an exception occurs while determining the schema
     */
    @Override
    public ResourceSchema getSchema(final String location, final Job job) throws IOException {
        final List<Schema.FieldSchema> schemaList = new ArrayList<Schema.FieldSchema>();
        for (final GoodwillSchemaField field : schema.getSchema()) {
            schemaList.add(new Schema.FieldSchema(field.getName(), getPigType(field.getType())));
        }

        return new ResourceSchema(new Schema(schemaList));
    }

    /**
     * Get statistics about the data to be loaded.  If no statistics are
     * available, then null should be returned.
     *
     * @param location Location as returned by
     *                 {@link org.apache.pig.LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job      The {@link org.apache.hadoop.mapreduce.Job} object - this should be used only to obtain
     *                 cluster properties through {@link org.apache.hadoop.mapreduce.Job#getConfiguration()} and not to set/query
     *                 any runtime job information.
     * @return statistics about the data to be loaded.  If no statistics are
     *         available, then null should be returned.
     * @throws java.io.IOException if an exception occurs while retrieving statistics
     */
    @Override
    public ResourceStatistics getStatistics(final String location, final Job job) throws IOException {
        return null;
    }

    /**
     * Find what columns are partition keys for this input.
     *
     * @param location Location as returned by
     *                 {@link org.apache.pig.LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job      The {@link org.apache.hadoop.mapreduce.Job} object - this should be used only to obtain
     *                 cluster properties through {@link org.apache.hadoop.mapreduce.Job#getConfiguration()} and not to set/query
     *                 any runtime job information.
     * @return array of field names of the partition keys. Implementations
     *         should return null to indicate that there are no partition keys
     * @throws java.io.IOException if an exception occurs while retrieving partition keys
     */
    @Override
    public String[] getPartitionKeys(final String location, final Job job) throws IOException {
        return null;
    }

    /**
     * Set the filter for partitioning.  It is assumed that this filter
     * will only contain references to fields given as partition keys in
     * getPartitionKeys. So if the implementation returns null in
     * {@link #getPartitionKeys(String, org.apache.hadoop.mapreduce.Job)}, then this method is not
     * called by Pig runtime. This method is also not called by the Pig runtime
     * if there are no partition filter conditions.
     *
     * @param partitionFilter that describes filter for partitioning
     * @throws java.io.IOException if the filter is not compatible with the storage
     *                             mechanism or contains non-partition fields.
     */
    @Override
    public void setPartitionFilter(final Expression partitionFilter) throws IOException {
    }
}
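
Usage

Below is a minimal, hypothetical sketch of the value conversions this loader performs. The event fields and values are invented for illustration, and the class is not part of the library; it only mirrors the getJsonValue branches above and needs nothing but Jackson to run.

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class SmileStorageMappingDemo {
    public static void main(final String[] args) {
        // Fake event payload; in SmileStorage the payload comes from a SmileEnvelopeEvent
        final ObjectNode event = new ObjectMapper().createObjectNode();
        event.put("active", true);              // BOOLEAN field
        event.put("count", 42);                 // SHORT/INTEGER field
        event.put("createdAt", 1300000000000L); // LONG/DATE field
        event.put("score", 3.14);               // DOUBLE field
        event.put("name", "example");           // IP/STRING field

        // Same conversions as getJsonValue: boolean -> 0/1 int, int, long, double, String
        System.out.println(event.get("active").booleanValue() ? 1 : 0);
        System.out.println(event.get("count").intValue());
        System.out.println(event.get("createdAt").longValue());
        System.out.println(event.get("score").doubleValue());
        System.out.println(event.get("name").textValue());
    }
}

From a Pig script, the loader would typically be invoked along the lines of LOAD '/path/to/smile/files' USING com.ning.metrics.serialization.hadoop.pig.SmileStorage('MyEventType'), where 'MyEventType' is the Goodwill schema name; the Goodwill host and port can be passed as the second and third constructor arguments or set via the goodwill.host and goodwill.port system properties.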