org.apache.pig.impl.util.avro.AvroRecordWriter.java Source code

Java tutorial

Introduction

Below is the complete source code of org.apache.pig.impl.util.avro.AvroRecordWriter.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.impl.util.avro;

import static org.apache.avro.file.DataFileConstants.DEFAULT_SYNC_INTERVAL;
import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC;
import static org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL;
import static org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY;
import static org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;

import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.data.Tuple;

/**
 * RecordWriter for Avro objects.
 */
/**
 * RecordWriter that serializes Pig output as Avro container files.
 *
 * <p>Instances are created without a schema; {@link #prepareToWrite(Schema)}
 * must be invoked with a non-null schema before any call to
 * {@link #write(NullWritable, Object)}. Values may be either ready-made
 * {@link GenericData.Record} instances or Pig {@link Tuple}s, which are packed
 * into Avro records using the schema supplied to {@code prepareToWrite}.
 * Not thread-safe.
 */
public class AvroRecordWriter extends RecordWriter<NullWritable, Object> {

    // Schema used to pack Tuples into Avro records; set by prepareToWrite().
    private Schema schema = null;
    // Underlying Avro container-file writer; created lazily in prepareToWrite().
    private DataFileWriter<GenericData.Record> writer;
    // Destination path of the Avro data file.
    private final Path out;
    // Hadoop configuration consulted for codec, sync interval and file metadata.
    private final Configuration conf;

    /**
     * Creates new AvroRecordWriter. The schema is not supplied here; callers
     * must pass it later via {@link #prepareToWrite(Schema)}.
     * @param o Output path
     * @param c Hadoop configuration
     * @throws IOException declared for API compatibility; not thrown here
     */
    public AvroRecordWriter(/*final Schema s, */final Path o, final Configuration c) throws IOException {
        out = o;
        conf = c;
    }

    // copied from org.apache.avro.mapred.AvroOutputFormat
    /**
     * Applies job-level Avro output settings to {@code writer}: compression
     * codec, sync interval, and any user metadata carried in the job
     * configuration under Avro's text/binary metadata prefixes.
     *
     * @param writer the data-file writer to configure (before create())
     * @param job job configuration to read settings from
     * @throws UnsupportedEncodingException if ISO-8859-1 is unavailable
     *         (cannot happen on a conforming JVM)
     */
    static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer, JobConf job)
            throws UnsupportedEncodingException {
        if (FileOutputFormat.getCompressOutput(job)) {
            int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
            String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
            // deflate takes an explicit level; every other codec is looked up by name
            CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level)
                    : CodecFactory.fromString(codecName);
            writer.setCodec(factory);
        }

        // Do max as core-default.xml has io.file.buffer.size as 4K
        writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY,
                Math.max(job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

        // copy metadata from job
        for (Map.Entry<String, String> e : job) {
            if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
                writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
            if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
                // binary metadata is URL-encoded in the conf; decode via ISO-8859-1
                // so every byte value round-trips unchanged
                writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                        URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
        }
    }

    @Override
    public void close(final TaskAttemptContext arg0) throws IOException, InterruptedException {
        // writer is only instantiated in prepareToWrite(); a task that never
        // wrote a record would otherwise fail here with a NullPointerException.
        if (writer != null) {
            writer.close();
        }
    }

    @Override
    public void write(final NullWritable key, final Object value) throws IOException, InterruptedException {
        if (writer == null) {
            // Fail with a descriptive error instead of an opaque NullPointerException.
            throw new IOException(this.getClass().getName()
                    + ".write called before prepareToWrite");
        }
        if (value instanceof GenericData.Record) {
            // whoo-hoo! already avro
            writer.append((GenericData.Record) value);
        } else if (value instanceof Tuple) {
            // pack the object into an Avro record
            writer.append(AvroStorageDataConversionUtilities.packIntoAvro((Tuple) value, schema));
        } else {
            // Previously unsupported values were dropped silently; surface the
            // problem instead of losing data.
            throw new IOException(this.getClass().getName()
                    + ".write called with unsupported value type: "
                    + (value == null ? "null" : value.getClass().getName()));
        }
    }

    /**
     * Binds this writer to {@code s} and opens the underlying Avro data file
     * at the configured output path. Must be called exactly once, before the
     * first {@link #write(NullWritable, Object)}.
     *
     * @param s schema for all records written to this file
     * @throws IOException if {@code s} is null or the output file cannot be created
     */
    public void prepareToWrite(Schema s) throws IOException {
        if (s == null) {
            throw new IOException(this.getClass().getName() + ".prepareToWrite called with null schema");
        }
        schema = s;
        DatumWriter<GenericData.Record> datumWriter = new GenericDatumWriter<GenericData.Record>(s);
        writer = new DataFileWriter<GenericData.Record>(datumWriter);
        configureDataFileWriter(writer, new JobConf(conf));
        writer.create(s, out.getFileSystem(conf).create(out));
    }

}