Java tutorial
/**
 * Copyright (C) 2011 Metamarkets http://metamx.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metamx.milano.pig;

import com.google.protobuf.ByteString;
import com.google.protobuf.DescriptorProtos;
import com.google.protobuf.Descriptors;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.Message;
import com.metamx.milano.hadoop.MilanoProtoFileOutputFormat;
import com.metamx.milano.proto.MilanoTool;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
import org.apache.pig.ResourceSchema;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;

import java.io.IOException;
import java.util.Iterator;
import java.util.Properties;

/**
 * A Pig StoreFunc that writes tuples out as Protocol Buffer messages.
 * On the client it translates the Pig schema into a ProtoBuf descriptor and stashes it
 * in the UDFContext; on the mapper/reducer side it rebuilds the descriptor and emits
 * one DynamicMessage per tuple through a MilanoProtoFileOutputFormat.
 */
public class MilanoStoreFunc extends StoreFunc {
    private Logger log = Logger.getLogger(MilanoStoreFunc.class);

    private RecordWriter<String, Message> recordWriter;
    private String udfSignature;
    private Descriptors.Descriptor schemaDescriptor;

    /**
     * Set the UDF signature, which is used to share the ProtoBuf schema between the client and mapper/reducer.
     * Called in both the client and mapper/reducer contexts.
     *
     * @param signature The UDF signature.
     */
    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        udfSignature = signature;
    }

    /* *** Client side calls *** */

    /**
     * Here we take the ResourceSchema and generate the ProtoBuf Descriptor.
     * We take the Descriptor, base64 encode it, and store it into the UDFContext.
     * The majority of the work of this function is done in getProtoSchema.
     *
     * @param schema The ResourceSchema supplied by Pig.
     *
     * @throws IOException Thrown if the descriptor cannot be validated.
     */
    @Override
    public void checkSchema(ResourceSchema schema) throws IOException {
        Properties props = getUDFProps();
        log.debug("Generating Descriptor and storing base64 into \"milano.pig.proto.schema.base64\"");

        DescriptorProtos.DescriptorProto protoSchema = getProtoSchema(schema);
        DescriptorProtos.FileDescriptorProto fileDescriptorProto = DescriptorProtos.FileDescriptorProto.newBuilder()
            .addMessageType(protoSchema)
            // .setName("milano_dynamic_type.proto")
            // .setPackage("metamx.milano.dynamic")
            .build();

        props.setProperty(
            "milano.pig.proto.schema.base64",
            MilanoTool.with(protoSchema, fileDescriptorProto, DescriptorProtos.FileDescriptorSet.getDefaultInstance())
                      .getBase64()
        );
    }

    /**
     * This takes a Pig ResourceSchema and builds a ProtoBuf DescriptorProto out of it.
     *
     * @param schema The ResourceSchema to create a DescriptorProto for.
     *
     * @return A DescriptorProto matching the ResourceSchema. Note that this is not a Descriptor,
     *         but the proto object representing one.
     *
     * @throws IOException Thrown if the ResourceSchema contains an unsupported or unknown type.
     */
    private DescriptorProtos.DescriptorProto getProtoSchema(ResourceSchema schema) throws IOException {
        DescriptorProtos.DescriptorProto.Builder builder = DescriptorProtos.DescriptorProto.newBuilder();
        builder.setName("PigTupleSchema");

        int index = 1;
        for (ResourceSchema.ResourceFieldSchema fieldSchema : schema.getFields()) {
            String fieldName = fieldSchema.getName();
            fieldName = fieldName.substring(fieldName.lastIndexOf(':') + 1);
            log.debug(String.format(
                "Starting field [%s] of type [%s] for index [%s] with full name [%s]",
                fieldName, DataType.findTypeName(fieldSchema.getType()), index, fieldSchema.getName()
            ));

            switch (fieldSchema.getType()) {
                // Scalar types, handled by addField.
                case DataType.CHARARRAY:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_STRING, index, fieldName);
                    break;
                case DataType.BYTEARRAY:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_BYTES, index, fieldName);
                    break;
                case DataType.DOUBLE:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_DOUBLE, index, fieldName);
                    break;
                case DataType.FLOAT:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_FLOAT, index, fieldName);
                    break;
                case DataType.INTEGER:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_INT32, index, fieldName);
                    break;
                case DataType.LONG:
                    addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_INT64, index, fieldName);
                    break;

                // Container types. These are totally untested.
                case DataType.TUPLE:
                    // Recurse into the nested schema.
                    log.debug(String.format("Adding tuple field [%s]", fieldName));
                    builder.addFieldBuilder()
                           .mergeFrom(getProtoSchema(fieldSchema.getSchema()))
                           .setName(fieldName)
                           .setNumber(index)
                           .build();
                    break;
                case DataType.BAG: {
                    // Create a packed repeated field and recurse.
                    log.debug(String.format("Adding bag field [%s]", fieldName));
                    ResourceSchema.ResourceFieldSchema[] fs = fieldSchema.getSchema().getFields();
                    if (fs.length != 1 || fs[0].getType() != DataType.TUPLE) {
                        throw logAndReturnIOE("Found a bag without a tuple inside!");
                    }
                    DescriptorProtos.FieldDescriptorProto.Builder fieldBuilder = builder.addFieldBuilder();
                    fieldBuilder.setOptions(fieldBuilder.getOptionsBuilder().setPacked(true))
                                .setLabel(DescriptorProtos.FieldDescriptorProto.Label.LABEL_REPEATED)
                                .mergeFrom(getProtoSchema(fieldSchema.getSchema()))
                                .setName(fieldName)
                                .setNumber(index)
                                .build();
                    break;
                }

                // Unsupported types.
                case DataType.MAP:
                    throw logAndReturnIOE("The Pig MAP type is currently not supported.");
                case DataType.NULL:
                    throw logAndReturnIOE("The Pig NULL type is currently not supported.");
                case DataType.ERROR:
                    throw logAndReturnIOE("The Pig ERROR type is currently not supported.");
                case DataType.BIGCHARARRAY:
                case DataType.GENERIC_WRITABLECOMPARABLE:
                    throw new UnsupportedOperationException("Pig datatype not supported.");

                // Pig internal types.
                case DataType.BOOLEAN:
                case DataType.BYTE:
                case DataType.INTERNALMAP:
                    throw logAndReturnIOE("Use of internal type in schema definition.");

                // Unknown (duh).
                case DataType.UNKNOWN:
                default:
                    throw logAndReturnIOE("Unknown data type.");
            }
            index++;
        }

        return builder.build();
    }

    /**
     * This adds a field to a DescriptorProto being built.
     *
     * @param builder   The builder to add the field to.
     * @param fieldType The type of field to add.
     * @param index     The number to assign the field.
     * @param name      The name to give the field.
     */
    private void addField(
        final DescriptorProtos.DescriptorProto.Builder builder,
        final DescriptorProtos.FieldDescriptorProto.Type fieldType,
        final int index,
        final String name
    ) {
        log.debug(String.format("Adding field [%s] of type [%s]", name, fieldType.name()));
        builder.addFieldBuilder().setType(fieldType).setName(name).setNumber(index).build();
    }

    /* *** Mapper/Reducer side calls *** */

    /**
     * This does the setup for the mapper/reducer side.
     *
     * @param location The output path.
     * @param job      The job config.
     *
     * @throws IOException Currently not thrown, but part of the overridden signature.
     */
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        FileOutputFormat.setOutputPath(job, new Path(location));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        Properties props = getUDFProps();
        job.getConfiguration().set(
            "com.metamx.milano.proto.descriptor.base64",
            (String) props.get("milano.pig.proto.schema.base64")
        );
    }

    /**
     * This retrieves the OutputFormat for use by Hadoop.
     *
     * @return A {@link com.metamx.milano.hadoop.MilanoProtoFileOutputFormat}.
     *
     * @throws IOException Currently not thrown, but part of the overridden signature.
     */
    @Override
    public OutputFormat getOutputFormat() throws IOException {
        assert udfSignature != null;
        log.debug("Getting OutputFormat");
        return new MilanoProtoFileOutputFormat();
    }

    /**
     * Prepare this for writing to the OutputFormat.
     * This expects "milano.pig.proto.schema.base64" to be present in the UDFContext.
     *
     * @param writer The RecordWriter supplied by the OutputFormat returned from getOutputFormat.
     *
     * @throws IOException Thrown when the UDFContext contains invalid or missing base64 encoded DescriptorProto bytes.
     */
    @Override
    public void prepareToWrite(RecordWriter writer) throws IOException {
        log.debug("Preparing to write");
        @SuppressWarnings("unchecked")
        RecordWriter<String, Message> tempWriter = writer;
        recordWriter = tempWriter;

        Properties props = getUDFProps();
        schemaDescriptor = MilanoTool.withBase64((String) props.get("milano.pig.proto.schema.base64"))
                                     .getDescriptor();
    }

    /**
     * This takes a Pig Tuple, converts it to a ProtoBuf Message, and writes it to the RecordWriter.
     * The majority of the work for this function is done by serializeTuple.
     *
     * @param tuple The Tuple to convert and write out.
     *
     * @throws IOException Thrown when interrupted or if serializeTuple throws the same.
     */
    @Override
    public void putNext(Tuple tuple) throws IOException {
        DynamicMessage.Builder builder = DynamicMessage.newBuilder(schemaDescriptor);
        serializeTuple(tuple, builder, schemaDescriptor);

        try {
            recordWriter.write("", builder.build());
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    /**
     * This takes a Tuple and populates messageBuilder.
     * The messageBuilder may be for a different type than the tupleDescriptor:
     * the tupleDescriptor is used to parse the incoming Tuple, while the
     * messageBuilder holds the type that is actually written out to the file.
     *
     * @param tuple           The Tuple to process.
     * @param messageBuilder  A message builder for the output Message type.
     * @param tupleDescriptor A descriptor used to decode the Tuple (as generated by getProtoSchema).
     *
     * @throws IOException Thrown when an unsupported type is encountered in the tupleDescriptor.
     */
    private void serializeTuple(
        final Tuple tuple,
        final Message.Builder messageBuilder,
        final Descriptors.Descriptor tupleDescriptor
    ) throws IOException {
        Iterator<Object> tupleIterator = tuple.getAll().iterator();
        Descriptors.Descriptor messageDescriptor = messageBuilder.getDescriptorForType();

        // Here we process the two descriptors in parallel. For the current version this is actually
        // unnecessary, but it anticipates being able to specify the actual message without using the
        // dynamic TypeMetadata.
        for (Descriptors.FieldDescriptor tupleFieldDescriptor : tupleDescriptor.getFields()) {
            Object item = tupleIterator.next();
            Message.Builder itemBuilder;

            if (item == null) {
                continue;
            }

            Descriptors.FieldDescriptor messageFieldDescriptor =
                messageDescriptor.findFieldByName(tupleFieldDescriptor.getName());

            switch (tupleFieldDescriptor.getJavaType()) {
                case INT:
                case LONG:
                case FLOAT:
                case DOUBLE:
                case STRING:
                    messageBuilder.setField(messageFieldDescriptor, item);
                    break;

                case BYTE_STRING:
                    // Convert the Pig object into bytes to be serialized. These could contain arbitrary
                    // binary data, so they must be handled differently.
                    messageBuilder.setField(messageFieldDescriptor, ByteString.copyFrom(DataType.toBytes(item)));
                    break;

                // This functionality is totally untested.
                case MESSAGE:
                    Descriptors.Descriptor subTupleDescriptor = tupleFieldDescriptor.getMessageType();
                    if (tupleFieldDescriptor.isRepeated()) {
                        // This is a bag.
                        for (Object subTuple : (DataBag) item) {
                            itemBuilder = messageBuilder.newBuilderForField(messageFieldDescriptor);
                            serializeTuple((Tuple) subTuple, itemBuilder, subTupleDescriptor);
                            messageBuilder.addRepeatedField(messageFieldDescriptor, itemBuilder.build());
                        }
                    } else {
                        // This is a tuple.
                        itemBuilder = messageBuilder.newBuilderForField(messageFieldDescriptor);
                        serializeTuple((Tuple) item, itemBuilder, subTupleDescriptor);
                        messageBuilder.setField(messageFieldDescriptor, itemBuilder.build());
                    }
                    break;

                case BOOLEAN:
                    throw logAndReturnIOE("Boolean field in generated tuple descriptor.");
                case ENUM:
                    throw logAndReturnIOE("Enum field in generated tuple descriptor.");
                default:
                    throw logAndReturnIOE("Unknown data type.");
            }
        }
    }

    /**
     * Helper function to log and then return an IOException.
     *
     * @param message The message to report to the logger and to set in the IOException.
     *
     * @return An IOException with the given message.
     */
    private IOException logAndReturnIOE(String message) {
        IOException ioe = new IOException(message);
        log.error(message, ioe);
        return ioe;
    }

    /**
     * Retrieves the Properties object from the UDFContext.
     *
     * @return The properties for this UDF.
     */
    private Properties getUDFProps() {
        UDFContext udfContext = UDFContext.getUDFContext();
        return udfContext.getUDFProperties(this.getClass(), new String[]{udfSignature});
    }
}
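The heart of this StoreFunc is ProtoBuf's dynamic-message machinery: checkSchema builds a DescriptorProto from the Pig schema, and putNext later fills in a DynamicMessage for each tuple against the resolved Descriptor. The following minimal sketch shows that round trip with plain protobuf-java APIs and no Pig or Milano dependencies. The class name, field names, and values are invented for illustration, and resolving the descriptor with Descriptors.FileDescriptor.buildFrom is an assumption standing in for whatever MilanoTool does internally around its base64 encoding.

import com.google.protobuf.DescriptorProtos;
import com.google.protobuf.Descriptors;
import com.google.protobuf.DynamicMessage;

public class DynamicSchemaSketch {
    public static void main(String[] args) throws Exception {
        // Build a DescriptorProto the way getProtoSchema does: a name, number, and type per field.
        DescriptorProtos.DescriptorProto message = DescriptorProtos.DescriptorProto.newBuilder()
            .setName("PigTupleSchema")
            .addField(DescriptorProtos.FieldDescriptorProto.newBuilder()
                          .setName("name")
                          .setNumber(1)
                          .setType(DescriptorProtos.FieldDescriptorProto.Type.TYPE_STRING))
            .addField(DescriptorProtos.FieldDescriptorProto.newBuilder()
                          .setName("count")
                          .setNumber(2)
                          .setType(DescriptorProtos.FieldDescriptorProto.Type.TYPE_INT64))
            .build();

        // Wrap the DescriptorProto in a FileDescriptorProto and resolve it into a usable
        // Descriptor; this mirrors (as an assumption) what MilanoTool.with(...).getDescriptor()
        // must do after decoding the base64 schema on the mapper/reducer side.
        DescriptorProtos.FileDescriptorProto file = DescriptorProtos.FileDescriptorProto.newBuilder()
            .setName("pig_tuple_sketch.proto")
            .addMessageType(message)
            .build();
        Descriptors.Descriptor descriptor = Descriptors.FileDescriptor
            .buildFrom(file, new Descriptors.FileDescriptor[0])
            .findMessageTypeByName("PigTupleSchema");

        // Populate a DynamicMessage field by field, as putNext/serializeTuple do per tuple.
        DynamicMessage.Builder builder = DynamicMessage.newBuilder(descriptor);
        builder.setField(descriptor.findFieldByName("name"), "example");
        builder.setField(descriptor.findFieldByName("count"), 42L);
        System.out.println(builder.build());  // prints: name: "example"  count: 42
    }
}

From a Pig script the function is then used like any other StoreFunc, e.g. STORE data INTO 'output' USING com.metamx.milano.pig.MilanoStoreFunc(); where the alias and output path are placeholders.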