org.apache.hawq.pxf.plugins.hdfs.AvroFileAccessor.java Source code

Introduction

Below is the source code for org.apache.hawq.pxf.plugins.hdfs.AvroFileAccessor.java, a PXF accessor that reads records from Avro files on HDFS.

Source

package org.apache.hawq.pxf.plugins.hdfs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.hawq.pxf.api.OneRow;
import org.apache.hawq.pxf.api.utilities.InputData;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.*;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;

import static org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities.getAvroSchema;

/**
 * A PXF Accessor for reading Avro File records
 */
public class AvroFileAccessor extends HdfsSplittableDataAccessor {
    private AvroWrapper<GenericRecord> avroWrapper = null;

    /**
     * Constructs an AvroFileAccessor that creates the job configuration and
     * accesses the Avro file to fetch the Avro schema.
     *
     * @param input all input parameters coming from the client
     * @throws Exception if getting the Avro schema fails
     */
    public AvroFileAccessor(InputData input) throws Exception {
        // 1. Call the base class
        super(input, new AvroInputFormat<GenericRecord>());

        // 2. Access the avro file through the "unsplittable" API just to get the schema.
        //    The splittable API (AvroInputFormat), which is the one we will use to fetch
        //    the records, does not yet support getting the avro schema.
        Schema schema = getAvroSchema(conf, inputData.getDataSource());

        // 3. Pass the schema to the AvroInputFormat
        AvroJob.setInputSchema(jobConf, schema);

        // 4. Create the avroWrapper required for the iteration
        avroWrapper = new AvroWrapper<GenericRecord>();
    }

    /**
     * Creates an Avro record reader for the given split; called by the base
     * class as it iterates over the splits.
     */
    @Override
    protected Object getReader(JobConf jobConf, InputSplit split) throws IOException {
        return new AvroRecordReader<Object>(jobConf, (FileSplit) split);
    }

    /**
     * The Avro accessor is currently the only specialized accessor that
     * overrides this method. It does so because of the special
     * AvroRecordReader.next() semantics (the use of the AvroWrapper),
     * which prevent it from using the RecordReader's default
     * implementation in SplittableFileAccessor.
     */
    @Override
    public OneRow readNextObject() throws IOException {
        // Reset the datum to null, so stale bytes from the previous row's datum are not carried over
        avroWrapper.datum(null);
        if (reader.next(avroWrapper, NullWritable.get())) { // There is one more record in the current split.
            return new OneRow(null, avroWrapper.datum());
        } else if (getNextSplit()) { // The current split is exhausted; try to move to the next split.
            return reader.next(avroWrapper, NullWritable.get()) ? new OneRow(null, avroWrapper.datum()) : null;
        }

        // If neither condition was met, we have already read all the records in all the
        // splits, and no record was set in this call, so we return null to signal the end
        // of the record sequence - in this case avroWrapper.datum() will be null.
        return null;
    }
}
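
The constructor relies on HdfsUtilities.getAvroSchema to pull the schema out of the Avro file header before any records are read. That helper is not shown here, but the following is a minimal sketch of how a schema can be fetched through Avro's non-split file API (FsInput plus DataFileReader); the class and method names are illustrative, not part of PXF:

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class AvroSchemaPeek {
    /**
     * Opens the Avro file just long enough to read the schema stored in its
     * header, then closes it. Illustrative stand-in for what a helper like
     * HdfsUtilities.getAvroSchema could do.
     */
    public static Schema peekSchema(Configuration conf, String dataSource) throws IOException {
        // FsInput adapts an HDFS path to Avro's SeekableInput
        FsInput in = new FsInput(new Path(dataSource), conf);
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        try (DataFileReader<GenericRecord> fileReader =
                new DataFileReader<GenericRecord>(in, datumReader)) {
            // The writer schema lives in the file header, so no records need to be read
            return fileReader.getSchema();
        }
    }
}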
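
To see where readNextObject() fits, here is a sketch of the read loop a caller would drive, assuming the openForRead/readNextObject/closeForRead contract of PXF's ReadAccessor interface that HdfsSplittableDataAccessor implements; the demo class is illustrative only:

import org.apache.avro.generic.GenericRecord;
import org.apache.hawq.pxf.api.OneRow;
import org.apache.hawq.pxf.api.utilities.InputData;

public class AvroAccessorDemo {
    // Illustrative read loop; preparing the InputData (data source,
    // fragment metadata) is assumed to happen elsewhere in the PXF service.
    public static void dumpRecords(InputData input) throws Exception {
        AvroFileAccessor accessor = new AvroFileAccessor(input);
        if (!accessor.openForRead()) {
            return; // no splits were assigned to this request
        }
        try {
            OneRow row;
            // readNextObject() returns null once every split is exhausted
            while ((row = accessor.readNextObject()) != null) {
                GenericRecord record = (GenericRecord) row.getData();
                System.out.println(record);
            }
        } finally {
            accessor.closeForRead();
        }
    }
}

Note that the OneRow key is always null here (the accessor builds each row as new OneRow(null, avroWrapper.datum())); the datum is the GenericRecord from which a downstream resolver extracts the fields.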