Java tutorial
package com.ebay.erl.mobius.core.datajoin; import java.io.DataInput; import java.io.IOException; import java.util.Arrays; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.util.ReflectionUtils; import com.ebay.erl.mobius.core.ConfigureConstants; import com.ebay.erl.mobius.core.model.Tuple; import com.ebay.erl.mobius.core.model.TupleColumnComparator; import com.ebay.erl.mobius.core.sort.Sorter; import com.ebay.erl.mobius.util.SerializableUtil; import com.ebay.erl.mobius.util.Util; /** * <p> * This product is licensed under the Apache License, Version 2.0, * available at http://www.apache.org/licenses/LICENSE-2.0. * * This product contains portions derived from Apache hadoop which is * licensed under the Apache License, Version 2.0, available at * http://hadoop.apache.org. * * 2007 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan * */ @SuppressWarnings("unchecked") public class DataJoinKey extends Tuple { // add 2 digit prefix to ensure when deserialize it from byte arrays, // we preserve the order, because when {@link Tuple} serialize itself, // it iterates column names (in natural order) one by one to serialize // the values of the columns. public static String ACUTAL_KEY = "00_MOBIUS_KEY"; public static String DATASET_ID = "01_MOBIUS_DATASETID"; //public static String SORT_KEYWORD_FIELDNAME = "02_MOBIUS_SORT_KEYWORD"; //public static String SORT_COMPARATOR_FIELDNAME = "03_MOBIUS_SORT_COMPARATOR"; // to be called by Hadoop on org.apache.hadoop.mapred.JobConf.getOutputKeyComparator public DataJoinKey() { } public DataJoinKey(Byte datasetID, WritableComparable<?> key) { set(datasetID, key, null, null); } public DataJoinKey(Byte datasetID, WritableComparable<?> key, WritableComparable<?> sortKeyword, Class<?> sortComparator) { set(datasetID, key, sortKeyword, sortComparator); } public void set(Byte datasetID, WritableComparable<?> key, WritableComparable<?> sortKeyword, Class<?> sortComparator) { this.put(ACUTAL_KEY, key); this.put(DATASET_ID, datasetID.byteValue()); //this.put(SORT_KEYWORD_FIELDNAME, sortKeyword==null?NullWritable.get():sortKeyword); //this.put(SORT_COMPARATOR_FIELDNAME, sortComparator==null?Class.class.getName():sortComparator.getName()); } @Override public void readFields(DataInput in) throws IOException { super.readFields(in); // ordering matters this.setSchema( new String[] { ACUTAL_KEY, DATASET_ID/*, SORT_KEYWORD_FIELDNAME, SORT_COMPARATOR_FIELDNAME*/ }); } public WritableComparable getKey() { return (WritableComparable<?>) this.get(ACUTAL_KEY); } public Byte getDatasetID() { return this.getByte(DATASET_ID); } public WritableComparable getSortKeyword() { //return (WritableComparable)this.get(SORT_KEYWORD_FIELDNAME); return null; } public Class getSortComparator() { //return Util.getClass(this.getString(SORT_COMPARATOR_FIELDNAME)); return null; } private static Sorter[] _SORTERS; private Sorter[] getSorter() { if (_SORTERS == null) { if (this.conf == null || this.conf.get(ConfigureConstants.SORTERS, "").isEmpty()) { _SORTERS = new Sorter[0]; } else { try { _SORTERS = (Sorter[]) SerializableUtil .deserializeFromBase64(this.conf.get(ConfigureConstants.SORTERS), conf); } catch (IOException e) { throw new RuntimeException("Cannot deserialize sorters from :[" + this.conf.get(ConfigureConstants.SORTERS) + "] using Base64 decoder.", e); } } } return _SORTERS; } @Override public int compareTo(Tuple other) { WritableComparable<?> key = (WritableComparable<?>) other.get(ACUTAL_KEY); int cmp = _COLUMN_COMPARATOR.compareKey(this.getKey(), key, this.getSorter(), this.conf); if (cmp != 0) return cmp; cmp = getDatasetID().compareTo(other.getByte(DATASET_ID)); if (cmp != 0) return cmp; return 0; } @Override public int compare(Tuple t1, Tuple t2) { if (t1 instanceof DataJoinKey && t2 instanceof DataJoinKey) { int result = t1.compareTo(t2); return result; } else { return super.compare(t1, t2); } } public static class Comparator extends WritableComparator { public Comparator() { super(DataJoinKey.class); } @Override public int compare(WritableComparable a, WritableComparable b) { if (a instanceof DataJoinKey && b instanceof DataJoinKey) { return ((DataJoinKey) a).getKey().compareTo(((DataJoinKey) b).getKey()); } return super.compare(a, b); } } private WritableComparable getKey(byte type, DataInputBuffer input) throws IOException { if (type == Tuple.NULL_WRITABLE_TYPE) return NullWritable.get(); else if (type == Tuple.TUPLE_TYPE) { Tuple newTuple = new Tuple(); newTuple.readFields(input); return newTuple; } else { WritableComparable w = (WritableComparable) ReflectionUtils.newInstance(Util.getClass(input.readUTF()), conf); w.readFields(input); return w; } } private final TupleColumnComparator _COLUMN_COMPARATOR = new TupleColumnComparator(); @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { DataInputBuffer d1 = new DataInputBuffer(); d1.reset(b1, s1, l1); DataInputBuffer d2 = new DataInputBuffer(); d2.reset(b2, s2, l2); int _compare_result = Integer.MAX_VALUE; try { // the comparing ordering: // 1. DataJoinKey#KEY_FIELDNAME // 2. DataJoinKey#DATASET_ID_FIELDNAME // 3. DataJoinKey#SORT_KEYWORD_FIELDNAME - removed // 4. DataJoinKey#SORT_COMPARATOR_FIELDNAME - removed // read number of columns from the two tuple, // but there is no need to compare the length // of columns, we just read the values. d1.readInt(); d2.readInt(); ////////////////////////////////////////////////////////// // compare KEY, values from DataJoinKey#KEY_FIELDNAME // KEY represents the actual key user specified /////////////////////////////////////////////////////////// byte type1 = d1.readByte(); byte type2 = d2.readByte(); _COLUMN_COMPARATOR.setType(type1, type2); // writable, check if they are Tuple or NullWritable if (type1 == Tuple.NULL_WRITABLE_TYPE && type2 == Tuple.NULL_WRITABLE_TYPE) { // consider equal, do nothing _compare_result = 0; } else if (type1 == Tuple.TUPLE_TYPE && type2 == Tuple.TUPLE_TYPE) { // both are Tuple Tuple k1 = (Tuple) getKey(type1, d1); Tuple k2 = (Tuple) getKey(type2, d2); _compare_result = _COLUMN_COMPARATOR.compareKey(k1, k2, this.getSorter(), conf); } else { // DataJoinKey only support NullWritable and Tuple for the DataJoinKey#KEY_FIELDNAME throw new IllegalArgumentException( "Cannot compare " + Tuple.getTypeString(type1) + " and " + Tuple.getTypeString(type2)); } // if they are not the same, these two records should go to // different reducer, or different reduce iteration. if (_compare_result != 0) return _compare_result; ////////////////////////////////////////////////////////////////////////// // compare DATASET_ID, values from DataJoinKey#DATASET_ID_FIELDNAME, // at this point, the keys are the same, they should go to the same // reducer, we need to make sure the values from DATASET1 always come // before DATASET2, so we need to compare the DATASET_ID here. ////////////////////////////////////////////////////////////////////////// try { _COLUMN_COMPARATOR.setType(d1.readByte(), d2.readByte()); _compare_result = _COLUMN_COMPARATOR.compare(d1, d2, this.conf); if (_compare_result != 0) return _compare_result; } catch (IOException e) { byte[] b = new byte[l1]; for (int i = 0; i < l1; i++) { b[i] = b1[s1 + i]; } System.err.println(Arrays.toString(b)); System.err.println("type1:" + type1 + ", type2:" + type2); throw e; } return 0; } catch (IOException e) { throw new RuntimeException(e); } } }