// Java tutorial: Pipes-compatible Avro InputFormat example
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.avro.mapred; import java.io.IOException; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.RecordReader; /** * An {@code InputFormat} for Avro data files that * is compatible with Hadoop Pipes. * <p> * This is a default implementation for Avro files; if parsing a Avro file that * deserializes to a specific class, a new {@link InputFormat} extending this * one should be used instead. The new {@code InputFormat} should also pass a * new implementation of {@link KeyValueGetter} meant for the specific schema * to the {@link PipesCompatibleAvroRecordReader}. * <p> * Uses avro.mapred.key.field.name and avro.mapred.value.field.name properties * to identify the key and value fields in an Avro file. By default, they are * "key" and "value". These fields must be in the bytes, string, enum, or fixed * Avro data type, and must not be nested. 
* <p> * By default, when pointed at a directory, this will silently skip over any * files in it that do not have .avro extension. To instead include all files, * set the avro.mapred.ignore.inputs.without.extension property to false. * * @param <T> the type of the input object */ public class PipesCompatibleAvroInputFormat<T> extends FileInputFormat<BytesWritable, BytesWritable> { private final AvroInputFormat<T> avroInputFormat = new AvroInputFormat<T>(); /** The name of the key field in the Avro file */ public static final String KEY_FIELD_NAME = "avro.mapred.key.field.name"; /** The default name of the key field in the Avro file */ public static final String KEY_FIELD_NAME_DEFAULT = "key"; /** The name of the value field in the Avro file */ public static final String VALUE_FIELD_NAME = "avro.mapred.value.field.name"; /** The default name of the value field in the Avro file */ public static final String VALUE_FIELD_NAME_DEFAULT = "value"; @Override protected FileStatus[] listStatus(JobConf job) throws IOException { return avroInputFormat.listStatus(job); } /** * Returns the {@code PipesCompatibleAvroRecordReader} to be used * * @return the PipesCompatibleAvroRecordReader to be used * @throws IOException if instantiating the * PipesCompatibleAvroRecordReader fails */ @Override public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { reporter.setStatus(split.toString()); return new PipesCompatibleAvroRecordReader<T>(job, (FileSplit) split, new AvroKeyValueGetter<T>(job.get(KEY_FIELD_NAME, KEY_FIELD_NAME_DEFAULT), job.get(VALUE_FIELD_NAME, VALUE_FIELD_NAME_DEFAULT))); } }