Java tutorial: inside Tap's MapperBridge

MapperBridge is the glue that runs a TapMapper under Hadoop's old mapred API. It sniffs the input file's format, unwraps each incoming record into the type the user's mapper expects, and wraps the mapper's output in the key/value types Hadoop needs. The annotated source follows.
/*
 * Licensed to Think Big Analytics, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Think Big Analytics, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2010 Think Big Analytics. All Rights Reserved.
 */
package tap.core;

import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.*;
import org.apache.avro.io.*;
import org.apache.avro.mapred.*;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.ReflectionUtils;
import org.codehaus.jackson.JsonParseException;

import tap.Phase;
import tap.Pipe;
import tap.TapMapper;
import tap.core.io.BinaryKey;
import tap.core.mapreduce.input.TapfileInputFormat;
import tap.core.mapreduce.io.ProtobufWritable;
import tap.formats.FileFormat;
import tap.formats.Formats;
import tap.formats.avro.JsonToGenericRecord;
import tap.util.ObjectFactory;
import tap.util.ReflectUtils;

/*
 * NB: KO and VO are set in Phase. Since those values determine the types handed to
 * the OutputCollector, map-only tasks cannot write to tap files unless this changes:
 *
 *   conf.setMapOutputKeyClass(AvroKey.class);
 *   conf.setMapOutputValueClass(AvroValue.class);
 */
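/*
 * Type parameters, as used throughout this class:
 *   KEY, VALUE - the key/value pair handed to us by the Hadoop InputFormat
 *   IN, OUT    - the logical input/output types of the wrapped TapMapper
 *   KO, VO     - the key/value types emitted to Hadoop's OutputCollector
 *                (BinaryKey/ProtobufWritable for map-only protobuf output,
 *                AvroKey/AvroValue when a reduce phase follows)
 */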
@SuppressWarnings("deprecation")
public class MapperBridge<KEY, VALUE, IN, OUT, KO, VO> extends MapReduceBase
        implements org.apache.hadoop.mapred.Mapper<KEY, VALUE, KO, VO> {

    private static final int SNIFF_HEADER_SIZE = 1000;

    private TapMapper<IN, OUT> mapper;
    private boolean isMapOnly;
    private TapContext<OUT> context;
    private Schema schema;
    private String groupBy;
    private String sortBy;
    private boolean isTextInput = false;
    private boolean isStringInput = false;
    private boolean isJsonInput = false;
    private boolean isProtoInput = false;
    private Schema inSchema;
    private int parseErrors = 0;
    private BinaryEncoder encoder = null;
    private EncoderFactory factory = new EncoderFactory();
    // TODO: make this configurable
    private int maxAllowedErrors = 1000;
    private Pipe<OUT> outPipe = null;

    @SuppressWarnings("unchecked")
    @Override
    public void configure(JobConf conf) {
        this.mapper = ReflectionUtils.newInstance(
                conf.getClass(Phase.MAPPER, TapMapper.class, TapMapper.class), conf);
        this.isMapOnly = conf.getNumReduceTasks() == 0;
        try {
            determineInputFormat(conf);
            determineOutputFormat(conf);
            this.groupBy = conf.get(Phase.GROUP_BY);
            this.sortBy = conf.get(Phase.SORT_BY);
        } catch (Exception e) {
            if (e instanceof RuntimeException)
                throw (RuntimeException) e;
            throw new RuntimeException(e);
        }
        mapper.setConf(conf);
        mapper.init(conf.get("map.input.file"));
    }

    @SuppressWarnings("unchecked")
    @Override
    public void map(KEY wrapper, VALUE value, OutputCollector<KO, VO> collector, Reporter reporter)
            throws IOException {
        if (this.context == null) {
            KeyExtractor<BinaryKey, OUT> extractor =
                    new ReflectionKeyExtractor<OUT>(schema, groupBy, sortBy);
            this.context = new TapContext<OUT>(new Collector<BinaryKey>(collector, extractor), reporter);
        }
        bindContextToPipe(collector, reporter);
        invokeMapper(wrapper, value, reporter);
    }
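    /*
     * Input handling happens in two steps: determineInputFormat() below records
     * what kind of input this task is reading, and invokeMapper() further down
     * unwraps each record accordingly. With TextInputFormat, the declared
     * MAP_IN_CLASS decides whether a line is passed through as Text, converted
     * to a String, or parsed as JSON into an Avro record; TapfileInputFormat
     * means protobuf input; any other input format is treated as Avro.
     */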
    /**
     * Compare the mapper's input file signature with the Hadoop-configured class
     * and record which kind of input this task is reading.
     *
     * @param conf
     * @throws FileNotFoundException
     * @throws IOException
     */
    private void determineInputFormat(JobConf conf) throws FileNotFoundException, IOException {
        FileFormat ff = sniffMapInFormat(conf);
        if (!ff.isCompatible(conf.getInputFormat())) {
            throw new IllegalArgumentException("Map input format not compatible with file format.");
        }
        // otherwise (neither text nor tapfile) the input is assumed to be Avro
        if (conf.getInputFormat() instanceof TextInputFormat) {
            Class<?> inClass = conf.getClass(Phase.MAP_IN_CLASS, Object.class, Object.class);
            if (inClass == String.class) {
                isStringInput = true;
            } else if (inClass == Text.class) {
                isTextInput = true;
            } else {
                isJsonInput = true;
                inSchema = ReflectUtils.getSchema((IN) ReflectionUtils.newInstance(inClass, conf));
            }
        }
        isProtoInput = conf.getInputFormat() instanceof TapfileInputFormat;
    }

    /**
     * Create a prototype of the mapper's output class, derive its Avro schema,
     * and build the Pipe the mapper writes to.
     *
     * @param conf
     */
    @SuppressWarnings("unchecked")
    private void determineOutputFormat(JobConf conf) throws Exception {
        Class<?> outClass = conf.getClass(Phase.MAP_OUT_CLASS, Object.class, Object.class);
        OUT out = (OUT) ObjectFactory.newInstance(outClass);
        outPipe = new Pipe<OUT>(out);
        schema = ReflectUtils.getSchema(out);
    }

    @SuppressWarnings("unchecked")
    private class Collector<K> extends AvroMultiCollector<OUT> {
        private final AvroWrapper<OUT> wrapper = new AvroWrapper<OUT>(null);
        private final AvroKey<K> keyWrapper = new AvroKey<K>(null);
        private final AvroValue<OUT> valueWrapper = new AvroValue<OUT>(null);
        private final KeyExtractor<K, OUT> extractor;
        private final K key;
        private OutputCollector<KO, VO> collector;
        // used by map-only jobs that write to a tapproto file
        private ProtobufWritable protobufWritable = new ProtobufWritable();

        public Collector(OutputCollector<KO, VO> collector, KeyExtractor<K, OUT> extractor) {
            this.collector = collector;
            this.extractor = extractor;
            key = extractor.getProtypeKey();
            keyWrapper.datum(key);
        }

        public void collect(OUT datum) throws IOException {
            if (isMapOnly) {
                // if the output from the mapper is a protobuf message, then
                // KO is a BinaryKey and VO is a ProtobufWritable
                if (datum instanceof com.google.protobuf.Message) {
                    extractor.setKey(datum, key);
                    protobufWritable.setConverter(datum.getClass());
                    protobufWritable.set(datum);
                    collector.collect((KO) key, (VO) protobufWritable);
                } else {
                    // map-only Avro output: the datum is the key, there is no value
                    wrapper.datum(datum);
                    collector.collect((KO) wrapper, (VO) NullWritable.get());
                }
            } else {
                // a reduce phase follows: emit the extracted key and the datum
                extractor.setKey(datum, key);
                valueWrapper.datum(datum);
                collector.collect((KO) keyWrapper, (VO) valueWrapper);
            }
        }
    }
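    /*
     * The JSON branch of invokeMapper() below converts each line by a round trip
     * through Avro: JsonToGenericRecord parses the line into a GenericRecord, a
     * GenericDatumWriter serializes it to bytes, and a SpecificDatumReader reads
     * those bytes back as the typed IN record. As the comment in the method
     * notes, a direct JSON-to-class conversion would be more efficient.
     */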
    /**
     * Unwrap the incoming record according to the input kind detected at
     * configure time, and hand it to the wrapped TapMapper.
     *
     * @param wrapper
     * @param value
     * @param reporter
     * @throws IOException
     */
    private void invokeMapper(KEY wrapper, VALUE value, Reporter reporter) throws IOException {
        if (isTextInput) {
            map((IN) value);
        } else if (isStringInput) {
            map((IN) ((Text) value).toString());
        } else if (isProtoInput) {
            map((IN) ((ProtobufWritable) value).get());
        } else if (isJsonInput) {
            String json = ((Text) value).toString();
            if (shouldSkip(json))
                return;
            // Inefficient implementation of JSON to Avro; more efficient would be
            // JsonToClass.jsonToRecord(json, inSchema). Instead, take the silly
            // conversion approach: serialize, then deserialize.
            try {
                GenericContainer c = JsonToGenericRecord.jsonToRecord(json, inSchema);
                GenericDatumWriter<GenericContainer> writer =
                        new GenericDatumWriter<GenericContainer>(inSchema);
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                writer.setSchema(inSchema);
                encoder = factory.binaryEncoder(bos, encoder);
                writer.write(c, encoder);
                encoder.flush(); // the binary encoder buffers; flush before reading bos
                byte[] data = bos.toByteArray();
                GenericDatumReader<IN> reader = new SpecificDatumReader<IN>(inSchema);
                reader.setSchema(inSchema);
                IN converted = reader.read(null,
                        DecoderFactory.defaultFactory().createBinaryDecoder(data, null));
                map(converted);
            } catch (JsonParseException jpe) {
                System.err.println("Failed to parse " + json + ": " + jpe.getMessage());
                reporter.incrCounter("ColHadoopMapper", "json-parse-error", 1L);
                if (++parseErrors > maxAllowedErrors) {
                    throw new RuntimeException(jpe);
                }
            }
        } else {
            map(((AvroWrapper<IN>) wrapper).datum());
        }
    }

    private void map(IN value) {
        mapper.map(value, outPipe);
    }

    /**
     * Bind the mapper's output pipe to the Tap context.
     *
     * @param collector
     * @param reporter
     */
    private void bindContextToPipe(OutputCollector<KO, VO> collector, Reporter reporter) {
        if (this.outPipe != null && this.outPipe.getContext() == null) {
            if (this.context == null) {
                KeyExtractor<BinaryKey, OUT> extractor =
                        new ReflectionKeyExtractor<OUT>(schema, groupBy, sortBy);
                outPipe.setContext(new TapContext<OUT>(
                        new Collector<BinaryKey>(collector, extractor), reporter));
            } else {
                outPipe.setContext(this.context);
            }
        }
    }

    /**
     * @return true for blank lines and for lines starting with "#" or "//" comments.
     */
    private boolean shouldSkip(String json) {
        int i;
        int len = json.length();
        for (i = 0; i < len; i++)
            if (!Character.isWhitespace(json.charAt(i)))
                break;
        if (i == len)
            return true; // blank line
        // skip comment lines
        return json.charAt(i) == '#'
                || (json.charAt(i) == '/' && len > (i + 1) && json.charAt(i + 1) == '/');
    }

    /**
     * Open the map input file and read its header to determine the file format.
     *
     * @param conf
     * @throws IOException
     * @throws FileNotFoundException
     */
    private FileFormat sniffMapInFormat(JobConf conf) throws IOException, FileNotFoundException {
        FileFormat returnFormat = Formats.UNKNOWN_FORMAT.getFileFormat();
        Path path = new Path(conf.get("map.input.file"));
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(path);
            byte[] header = readHeader(in);
            returnFormat = determineFileFormat(header);
        } finally {
            if (in != null)
                in.close();
        }
        return returnFormat;
    }

    /**
     * Match the header bytes against each known format's signature.
     *
     * @param header
     * @return the matching FileFormat, or UNKNOWN_FORMAT if none matches
     */
    private FileFormat determineFileFormat(byte[] header) {
        for (Formats format : Formats.values()) {
            if (format.getFileFormat().signature(header)) {
                return format.getFileFormat();
            }
        }
        return Formats.UNKNOWN_FORMAT.getFileFormat();
    }
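    // For example, Avro object container files begin with the magic bytes
    // 'O', 'b', 'j', 0x01; signatures of that kind are what each Formats
    // entry checks against the sniffed header.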
    /**
     * Read the first bytes of the input stream into a sniff buffer. The caller
     * owns the stream and is responsible for closing it.
     *
     * @param inputStream
     * @return byte buffer containing up to the first SNIFF_HEADER_SIZE bytes
     * @throws FileNotFoundException
     * @throws IOException
     */
    private byte[] readHeader(FSDataInputStream inputStream) throws FileNotFoundException, IOException {
        byte[] header = new byte[SNIFF_HEADER_SIZE];
        // A single read may return fewer than SNIFF_HEADER_SIZE bytes (e.g. for
        // short files); that is fine for signature sniffing.
        inputStream.read(header);
        return header;
    }

    @Override
    public void close() throws IOException {
        mapper.close(outPipe);
    }
}
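To see where the bridge sits in a job, here is a rough sketch of the wiring configure() expects, assembled only from the configuration keys read in this file. WordCountMapper, CountRec, and the "word" field name are hypothetical stand-ins for a user's TapMapper, its Avro output type, and a group-by field; the rest of the job setup (input/output paths and formats) is omitted.

JobConf conf = new JobConf();

// The bridge itself runs as the Hadoop mapper...
conf.setMapperClass(MapperBridge.class);

// ...and instantiates the user's mapper from Phase.MAPPER in configure().
// WordCountMapper is a hypothetical TapMapper<String, CountRec>.
conf.setClass(Phase.MAPPER, WordCountMapper.class, TapMapper.class);

// MAP_IN_CLASS == String.class makes determineInputFormat() treat each text
// line as a String; MAP_OUT_CLASS is the prototype used to derive the schema.
conf.setClass(Phase.MAP_IN_CLASS, String.class, Object.class);
conf.setClass(Phase.MAP_OUT_CLASS, CountRec.class, Object.class);

// GROUP_BY/SORT_BY feed the ReflectionKeyExtractor that builds BinaryKeys.
conf.set(Phase.GROUP_BY, "word");

// With reduce tasks configured, Collector emits AvroKey/AvroValue pairs;
// with conf.setNumReduceTasks(0) it would take the map-only paths instead.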