Java tutorial: the colossal.pipe ColPhase class, which configures, plans, and submits a single Avro-based Hadoop map/reduce phase.
/*
 * Licensed to Think Big Analytics, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Think Big Analytics, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2010 Think Big Analytics. All Rights Reserved.
 */
package colossal.pipe;

import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.*;

import org.apache.avro.Schema;
import org.apache.avro.mapred.*;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

@SuppressWarnings("deprecation")
public class ColPhase {
    private static final String SETTINGS = "col.phase.settings";
    public static final String MAPPER = "col.phase.mapper";
    public static final String REDUCER = "col.phase.reducer";
    public static final String MAP_OUT_CLASS = "col.phase.map.output.class";
    public static final String REDUCE_OUT_CLASS = "col.phase.reduce.output.class";
    public static final String GROUP_BY = "col.phase.groupby";
    public static final String SORT_BY = "col.phase.sortby";
    public static final String COMBINER = "col.phase.combiner";
    public static final String MAP_OUT_KEY_SCHEMA = "col.phase.map.out.key.schema";
    public static final String MAP_OUT_VALUE_SCHEMA = "col.phase.map.out.value.schema";
    public static final String MAP_IN_CLASS = "col.phase.map.input.class";

    /*
     * We *allow* for multiple reads, writes, maps, combines, and reduces; this would support *manual* optimization of merging.
     * We hope to never use them - instead we'll have simple mappings and rely on an optimizer that will let us do multi
     * input-output.
     */

    /** files read by main map/reduce pipeline */
    private List<ColFile> mainReads;
    /** any files read, including side reads */
    private List<ColFile> reads;
    /** files written by main map/reduce pipeline */
    private List<ColFile> mainWrites;
    /** any files written, including side writes */
    private List<ColFile> writes;
    private Class<? extends ColMapper>[] mappers;
    private Class<? extends ColReducer>[] combiners;
    private Class<? extends ColReducer>[] reducers;
    private String groupBy;
    private String sortBy;
    private Map<String, String> props = new LinkedHashMap<String, String>();
    private String name;
    private JobConf conf;
    private Integer deflateLevel;
    private Map<String, String> textMeta = new TreeMap<String, String>();

    public ColPhase() {
    }

    public ColPhase(String name) {
        this.name = name;
    }

    public ColFile output() {
        return output(0);
    }

    public ColFile output(int n) {
        if (mainWrites == null) {
            // this *should* set up a promise to get the nth output ...
            throw new UnsupportedOperationException("please define outputs first, for now");
        }
        return mainWrites.get(n);
    }

    public ColPhase reads(ColFile... inputs) {
        return reads(Arrays.asList(inputs));
    }

    public ColPhase reads(Collection<ColFile> inputs) {
        if (mainReads == null) {
            mainReads = new ArrayList<ColFile>(inputs);
        } else {
            mainReads.addAll(inputs);
        }
        return readsSide(inputs);
    }

    /**
     * side reads are files that are read but not as the input of a map/reduce step; instead they are read by tasks or
     * processes through an independent data path
     */
    public ColPhase readsSide(ColFile... sideFiles) {
        return readsSide(Arrays.asList(sideFiles));
    }

    /**
     * side reads are files that are read but not as the input of a map/reduce step; instead they are read by tasks or
     * processes through an independent data path
     */
    public ColPhase readsSide(Collection<ColFile> sideFiles) {
        if (this.reads == null) {
            this.reads = new ArrayList<ColFile>(sideFiles);
        } else {
            this.reads.addAll(sideFiles);
        }
        return this;
    }

    public ColPhase writes(ColFile... outputs) {
        return writes(Arrays.asList(outputs));
    }

    public ColPhase writes(Collection<ColFile> outputs) {
        writesSide(outputs);
        if (mainWrites == null) {
            mainWrites = new ArrayList<ColFile>(outputs);
        } else {
            mainWrites.addAll(outputs);
        }
        return this;
    }

    /**
     * side writes are files that are written but not as the output of a map/reduce step; instead they are written by tasks or
     * processes through an independent data path
     */
    public ColPhase writesSide(ColFile... sideFiles) {
        return writesSide(Arrays.asList(sideFiles));
    }

    /**
     * side writes are files that are written but not as the output of a map/reduce step; instead they are written by tasks or
     * processes through an independent data path
     */
    public ColPhase writesSide(Collection<ColFile> sideFiles) {
        for (ColFile file : sideFiles) {
            ColPhase p = file.getProducer();
            if (p != null && p != this) {
                throw new IllegalStateException("File " + file + " has multiple producers " + this + ", " + p);
            }
            file.setProducer(this);
        }
        if (this.writes == null) {
            this.writes = new ArrayList<ColFile>(sideFiles);
        } else {
            this.writes.addAll(sideFiles);
        }
        return this;
    }

    public ColPhase map(Class<? extends ColMapper>... mappers) {
        this.mappers = mappers;
        return this;
    }

    public ColPhase combine(Class<? extends ColReducer>... combiners) {
        this.combiners = combiners;
        return this;
    }

    public ColPhase reduce(Class<? extends ColReducer>... reducers) {
        this.reducers = reducers;
        return this;
    }

    public ColPhase groupBy(String groupBy) {
        this.groupBy = groupBy;
        return this;
    }

    public ColPhase sortBy(String sortBy) {
        this.sortBy = sortBy;
        return this;
    }

    public ColPhase set(String key, String value) {
        props.put(key, value);
        return this;
    }

    public ColPhase setSettings(Object settings) {
        return setJson(SETTINGS, settings);
    }

    public ColPhase setJson(String key, Object value) {
        return set(key, toJson(value));
    }

    public ColPhase addMeta(String prefix, String value) {
        textMeta.put(prefix, value);
        return this;
    }

    private String toJson(Object value) {
        //TODO
        throw new UnsupportedOperationException("toJson not yet working");
    }

    public List<PhaseError> plan(ColPipe distPipeline) {
        List<PhaseError> errors = new ArrayList<PhaseError>();
        conf = new JobConf(distPipeline.getConf());
        for (Map.Entry<String, String> entry : props.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }

        Schema mapin = null;
        Class<?> mapOutClass = null;
        Class<?> mapInClass = null;
        Class<? extends ColMapper> mapperClass = null;
        if (mappers != null && mappers.length > 0) {
            if (mappers.length > 1) {
                errors.add(new PhaseError("Colossal phase/avro currently only supports one mapper per process: " + name));
            } else {
                mapperClass = mappers[0];
                conf.set(MAPPER, mapperClass.getName());
                Class<?> foundIn = null;
                for (Method m : mapperClass.getMethods()) {
                    if ("map".equals(m.getName())) {
                        Class<?>[] paramTypes = m.getParameterTypes();
                        if (paramTypes.length >= 3) {
                            try {
                                // prefer subclass methods to superclass methods
                                if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                                    if (paramTypes[0] == Object.class) {
                                        if (foundIn == m.getDeclaringClass()) {
                                            // skip the generated "override" of the generic method
                                            continue;
                                        }
                                    } else {
                                        //TODO: handle cases beyond Object where output isn't defined
                                        mapInClass = paramTypes[0];
                                        mapin = getSchema(paramTypes[0].newInstance());
                                    }
                                    mapOutClass = paramTypes[1];
                                    foundIn = m.getDeclaringClass();
                                }
                            } catch (Exception e) {
                                errors.add(new PhaseError(e, "Can't create mapper: " + mapperClass));
                            }
                        }
                    }
                }
            }
        }

        if (combiners != null && combiners.length > 0) {
            if (combiners.length > 1) {
                errors.add(new PhaseError("Colossal phase/avro currently only supports one combiner per process: " + name));
            } else {
                conf.set(COMBINER, combiners[0].getName());
                conf.setCombinerClass(ColHadoopCombiner.class);
            }
        }

        Schema reduceout = null;
        Class<?> reduceOutClass = null;
        Class<? extends ColReducer> reducerClass = null;
        if (reducers != null && reducers.length > 0) {
            if (reducers.length != 1) {
                errors.add(new PhaseError("Colossal phase/avro currently only supports one reducer per process: " + name));
            } else {
                reducerClass = reducers[0];
                conf.set(REDUCER, reducers[0].getName());
                Class<?> foundIn = null;
                for (Method m : reducerClass.getMethods()) {
                    if ("reduce".equals(m.getName())) {
                        Class<?>[] paramTypes = m.getParameterTypes();
                        if (paramTypes.length >= 3) {
                            if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                                if (foundIn == m.getDeclaringClass() && paramTypes[1] == Object.class) {
                                    // skip the generated "override" of the generic method
                                    continue;
                                }
                                // prefer subclass methods to superclass methods
                                reduceOutClass = paramTypes[1];
                                foundIn = m.getDeclaringClass();
                            }
                        }
                    }
                }
                // XXX validation!
            }
        }

        Object reduceOutProto = null;
        //TODO: handle cases beyond Object where output isn't defined
        if ((reduceOutClass == null || reduceOutClass == Object.class) && mainWrites != null && mainWrites.size() > 0) {
            reduceOutProto = mainWrites.get(0).getPrototype();
            reduceOutClass = reduceOutProto.getClass();
        } else {
            try {
                reduceOutProto = reduceOutClass.newInstance();
            } catch (Exception e) {
                errors.add(new PhaseError(e, "Can't create reducer output class: " + reduceOutClass));
            }
        }
        if (reduceOutProto != null)
            reduceout = getSchema(reduceOutProto);
        conf.set(REDUCE_OUT_CLASS, reduceOutClass.getName());

        Schema valueSchema = null;
        if (mainWrites.size() != 1) {
            errors.add(new PhaseError("Colossal phase/avro currently only supports one output per process: " + name));
        } else {
            ColFile output = mainWrites.get(0);
            AvroOutputFormat.setOutputPath(conf, new Path(output.getPath()));
            if (output.getPrototype() != null) {
                valueSchema = getSchema(output.getPrototype());
                if (reduceout != null) {
                    assert reduceout.equals(valueSchema); // should make an error not assert this!
                }
            } else {
                if (reduceout == null) {
                    errors.add(new PhaseError("No output format defined"));
                }
                valueSchema = reduceout;
            }
            output.setupOutput(conf);
        }
        conf.set(AvroJob.OUTPUT_SCHEMA, valueSchema.toString());

        if (deflateLevel != null)
            AvroOutputFormat.setDeflateLevel(conf, deflateLevel);

        Object proto = null;
        if (mainReads != null && mainReads.size() > 0) {
            Path[] inPaths = new Path[mainReads.size()];
            int i = 0;
            for (ColFile file : mainReads) {
                inPaths[i++] = new Path(file.getPath());
                Object myProto = file.getPrototype();
                if (myProto == null) {
                    errors.add(new PhaseError("Files need non-null prototypes " + file));
                } else if (proto != null) {
                    if (myProto.getClass() != proto.getClass()) {
                        errors.add(new PhaseError("Inconsistent prototype classes for inputs: " + myProto.getClass() + " vs "
                                + proto.getClass() + " for " + file));
                    }
                } else {
                    proto = myProto;
                }
            }
            AvroInputFormat.setInputPaths(conf, inPaths);

            if (mapin == null) {
                if (proto == null) {
                    errors.add(new PhaseError("Undefined input format"));
                } else {
                    mapin = getSchema(proto);
                    mapInClass = proto.getClass();
                }
            }
            mainReads.get(0).setupInput(conf);
            if (conf.get("mapred.input.format.class") == null)
                conf.setInputFormat(AvroInputFormat.class);
        }

        Schema mapValueSchema = null;
        try {
            //TODO: handle cases beyond Object where input isn't defined
            if (mapOutClass == null || mapOutClass == Object.class) {
                assert mapperClass == null;
                if (proto != null) {
                    mapOutClass = proto.getClass();
                    mapValueSchema = getSchema(proto);
                } else {
                    // not available - try to get it from the reducer
                    if (reducerClass == null) {
                        mapOutClass = reduceOutClass;
                        mapValueSchema = getSchema(reduceOutClass.newInstance());
                    } else {
                        // can't get it from reducer input - that's just Iterable
                        String fname = "no input file specified";
                        if (mainReads != null && mainReads.size() > 0)
                            fname = mainReads.get(0).getPath();
                        errors.add(new PhaseError("No input format specified for identity mapper - specify it on input file "
                                + fname));
                    }
                }
            } else {
                mapValueSchema = getSchema(mapOutClass.newInstance());
            }
            if (mapValueSchema != null)
                conf.set(MAP_OUT_VALUE_SCHEMA, mapValueSchema.toString());
        } catch (Exception e) {
            errors.add(new PhaseError(e, "Can't create instance of map output class: " + mapOutClass));
        }
        conf.set(MAP_OUT_CLASS, mapOutClass.getName());
        conf.set(MAP_IN_CLASS, mapInClass.getName());
        // XXX validation!

        if (proto != null) {
            conf.set(AvroJob.INPUT_SCHEMA, getSchema(proto).toString());
        } else if (mapin != null) {
            conf.set(AvroJob.INPUT_SCHEMA, mapin.toString());
        } else {
            errors.add(new PhaseError("No map input defined"));
        }

        if (groupBy != null || sortBy != null) {
            conf.set(MAP_OUT_KEY_SCHEMA, group(mapValueSchema, groupBy, sortBy).toString());
        }
        if (groupBy != null) {
            conf.set(GROUP_BY, groupBy);
            AvroJob.setOutputMeta(conf, GROUP_BY, groupBy);
        }
        if (sortBy != null) {
            conf.setPartitionerClass(AvroGroupPartitioner.class);
            conf.set(SORT_BY, sortBy);
            AvroJob.setOutputMeta(conf, SORT_BY, sortBy);
        }

        conf.setMapOutputKeyClass(AvroKey.class);
        conf.setMapOutputValueClass(AvroValue.class);
        conf.setOutputKeyComparatorClass(ColKeyComparator.class);

        conf.setMapperClass(ColHadoopMapper.class);
        conf.setReducerClass(ColHadoopReducer.class);

        for (Map.Entry<String, String> entry : textMeta.entrySet())
            AvroJob.setOutputMeta(conf, entry.getKey(), entry.getValue());

        // add ColAvroSerialization to io.serializations
        Collection<String> serializations = conf.getStringCollection("io.serializations");
        if (!serializations.contains(ColAvroSerialization.class.getName())) {
            serializations.add(ColAvroSerialization.class.getName());
            conf.setStrings("io.serializations", serializations.toArray(new String[0]));
        }

        return errors;
    }

    public static Schema getSchema(Object proto) {
        try {
            Field schemaField = proto.getClass().getField("SCHEMA$");
            return (Schema) schemaField.get(null);
        } catch (NoSuchFieldException e) {
            // use reflection
            return ReflectData.get().getSchema(proto.getClass());
        } catch (Exception e) {
            throw new IllegalStateException(e);
        }
    }

    public PhaseError submit() {
        try {
            System.out.println("Submitting job:");
            System.out.println(getDetail());
            // probably should just make a conf right here?
            for (ColFile file : writes) {
                file.clearAndPrepareOutput(conf);
            }
            if (reads != null) {
                int i = 0;
                for (ColFile file : reads) {
                    // record inputs, to allow determination of obsolescence
                    // really we should be recording the transitive closure of dependencies here
                    // to allow determining files that are out of date with respect to their original source inputs
                    // although that could get costly for large numbers of log files...
                    AvroJob.setOutputMeta(conf, "input.file.name." + i, file.getPath());
                    AvroJob.setOutputMeta(conf, "input.file.mtime." + i, file.getTimestamp(conf));
                    i++;
                }
            }
            JobClient.runJob(conf);
            return null;
        } catch (Throwable t) {
            System.err.println("Job failure");
            t.printStackTrace();
            // clean up failed job output
            for (ColFile file : getOutputs()) {
                file.delete(conf);
            }
            return new PhaseError(t, name);
        }
    }

    public List<ColFile> getInputs() {
        return Collections.unmodifiableList(reads);
    }

    public List<ColFile> getOutputs() {
        return Collections.unmodifiableList(writes);
    }

    /**
     * create a schema containing just the listed comma-separated fields
     */
    public static Schema group(Schema schema, String... fields) {
        List<String> fieldList = new ArrayList<String>(fields.length);
        for (String list : fields) {
            if (list == null)
                continue;
            for (String field : list.split(",")) {
                field = field.trim();
                String[] parts = field.split("\\s");
                if (parts.length > 0) {
                    fieldList.add(parts[0]);
                }
            }
        }
        return group(schema, fieldList);
    }

    public static Schema group(Schema schema, List<String> fields) {
        ArrayList<Schema.Field> fieldList = new ArrayList<Schema.Field>(fields.size());
        StringBuilder builder = new StringBuilder();
        String missing = null;
        Set<String> held = new TreeSet<String>();
        for (String fieldname : fields) {
            if (held.contains(fieldname))
                continue;
            held.add(fieldname);
            Schema.Field field = schema.getField(fieldname.trim());
            if (field == null) {
                if (missing == null) {
                    missing = "Invalid group by/sort by - fields not in map output record are: ";
                } else {
                    missing += ", ";
                }
                missing += fieldname.trim();
                continue;
            }
            Schema.Field copy = new Schema.Field(fieldname, field.schema(), field.doc(), field.defaultValue(), field.order());
            fieldList.add(copy);
            builder.append('_');
            builder.append(fieldname);
        }
        if (missing != null) {
            throw new IllegalArgumentException(missing);
        }
        schema = Schema.createRecord(schema.getName() + "_proj" + builder.toString(), "generated", schema.getNamespace(), false);
        schema.setFields(fieldList);
        return schema;
    }

    public String getSummary() {
        return "mapper " + getMapName() + " reading " + phaseReads() + " reducer " + getReduceName();
    }

    private String getReduceName() {
        return reducers == null || reducers[0] == null ? "identity" : reducers[0].getName();
    }

    private String getMapName() {
        return mappers == null || mappers[0] == null ? "identity" : mappers[0].getName();
    }

    private String getDetail() {
        return String.format("map: %s\nreduce: %s\nreading: %s\nwriting: %s\ngroup by:%s%s", getMapName(), getReduceName(),
                phaseReads(), mainWrites.get(0).getPath(), groupBy, (sortBy == null ? "" : "\nsort by:" + sortBy));
    }

    private String phaseReads() {
        StringBuilder reading = new StringBuilder();
        for (ColFile read : mainReads) {
            if (reading.length() > 0)
                reading.append(',');
            reading.append(read.getPath());
        }
        return reading.toString();
    }
}
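For orientation, here is a minimal usage sketch of the fluent API above. It is not part of the original listing: only the ColPhase methods shown in the class (reads, writes, map, groupBy, reduce, plan, submit) are taken from it, while the pipeline, events, and counts objects and the CountMapper/CountReducer classes are hypothetical placeholders whose construction is not shown here.

// Hypothetical wiring of one phase. Assume `pipeline` is a ColPipe and `events`/`counts`
// are ColFile instances created elsewhere, and that CountMapper implements ColMapper and
// CountReducer implements ColReducer for the record types involved.
ColPhase phase = new ColPhase("count-by-user")
        .reads(events)                  // main map/reduce input file
        .writes(counts)                 // main map/reduce output file
        .map(CountMapper.class)
        .groupBy("userId")              // must name a field of the map output record
        .reduce(CountReducer.class);

List<PhaseError> errors = phase.plan(pipeline);   // builds the JobConf and collects configuration errors
if (errors.isEmpty() && phase.submit() == null) { // submit() returns null on success, a PhaseError on failure
    System.out.println("phase completed");
}

Note that plan() reports problems such as missing prototypes or an unsupported number of mappers as PhaseError objects rather than throwing, so a caller can inspect the full list before deciding whether to call submit().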