Java tutorial: SchemaStatisticalSummary (com.cloudera.recordbreaker.schemadict), statistical summaries and schema matching for Avro data
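The listing below is the complete source of com.cloudera.recordbreaker.schemadict.SchemaStatisticalSummary, a class that builds per-field statistical summaries of Avro data files and computes a lowest-cost field-to-field mapping between two such summaries. The class comment at the top of the listing describes an addData()/finalizeStatistics()/measureDistance() workflow, but the public entry points actually present in the code are createSummaryFromData() and getBestMapping(), so the minimal usage sketch below sticks to those. The file names and the example class name are placeholders, and SchemaMapping's accessors are defined elsewhere in the package, so the sketch only prints the mapping object.

    import java.io.File;
    import java.io.IOException;

    import com.cloudera.recordbreaker.schemadict.SchemaMapping;
    import com.cloudera.recordbreaker.schemadict.SchemaStatisticalSummary;

    public class SchemaSummaryExample {
      public static void main(String[] args) throws IOException {
        // Placeholder paths; each file must be an Avro container whose top-level
        // schema is a record, otherwise createSummaryFromData() throws an IOException.
        SchemaStatisticalSummary first = new SchemaStatisticalSummary("dataset-A");
        SchemaStatisticalSummary second = new SchemaStatisticalSummary("dataset-B");
        first.createSummaryFromData(new File("datasetA.avro"));
        second.createSummaryFromData(new File("datasetB.avro"));

        // Optional: ignore attribute labels so matching is driven by data statistics alone.
        first.setUseAttributeLabels(false);

        // Per-field statistics gathered from the records that were scanned.
        System.out.println(first.dumpSummary());

        // Lowest-cost set of TRANSFORM/CREATE/DELETE operations mapping A onto B.
        SchemaMapping mapping = first.getBestMapping(second);
        System.out.println(mapping);
      }
    }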
/* * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved. * * Cloudera, Inc. licenses this file to you under the Apache License, * Version 2.0 (the "License"). You may not use this file except in * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. See the License for * the specific language governing permissions and limitations under the * License. */ package com.cloudera.recordbreaker.schemadict; import java.io.File; import java.io.IOException; import java.io.DataInput; import java.io.DataOutput; import java.util.Iterator; import java.util.HashMap; import java.util.TreeMap; import java.util.HashSet; import java.util.TreeSet; import java.util.Set; import java.util.SortedSet; import java.util.Hashtable; import java.util.Map; import java.lang.Math; import java.util.List; import java.util.Arrays; import java.util.ArrayList; import java.nio.ByteBuffer; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.file.DataFileReader; import org.apache.avro.reflect.ReflectDatumReader; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericArray; import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; import org.apache.avro.util.Utf8; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.UTF8; import org.apache.hadoop.io.Text; /******************************************** * The SchemaStatistical Summary object is designed to mirror the structure of an input Schema. * In addition to the name and type information associated with a Schema object, it keeps statistical data * about observed actual data values that correspond to each Schema element. * * This class is intended to be used in the following way: * 1) Instantiate a SchemaStatisticalSummary object with a preexisting Schema. * 2) For each GenericData item that exhibits the Schema, call SchemaStatisticalSummary.addData(GenericData). This is * designed to be called multiple times. * 3) Once all the desired data has been added, call finalizeStatistics(). * 4) The resulting finalized SchemaStatisticalSummary object can then be compared to other SchemaStatisticalSummary objects with the measureDistance() function. * ********************************************/ public class SchemaStatisticalSummary implements Writable { final static byte MAGIC = (byte) 0xa1; final static byte VERSION = (byte) 1; final static int MAX_SUMMARY_SAMPLES = 50; final static double MATCHCOST_TYPE_CLASH = 1 * 10 * 1000; final static double MATCHCOST_CREATE = 1 * 1000; final static double MATCHCOST_DELETE = 1 * 1000; final static short ARRAY_NODE = 1; final static short BOOLEAN_NODE = 2; final static short BYTES_NODE = 3; final static short DOUBLE_NODE = 4; final static short ENUM_NODE = 5; final static short FIXED_NODE = 6; final static short FLOAT_NODE = 7; final static short INT_NODE = 8; final static short LONG_NODE = 9; final static short MAP_NODE = 10; final static short NULL_NODE = 11; final static short RECORD_NODE = 12; final static short STRING_NODE = 13; final static short UNION_NODE = 14; ///////////////////////////////////////////////// // Inner classes ///////////////////////////////////////////////// /***************************************************** * SummaryNode is a generic statistical summary object for a given elt in the * hierarchy. 
A single tuple in the source data may yield a number of nested * SummaryNodes, all rooted at a GenericRecord. * * The hierarchy is instantiated by examining the schema. Each new data item * results in a call to SummaryNode.addData(), in which the data item is passed in. ******************************************************/ abstract class SummaryNode implements Cloneable { SummaryNode parent = null; int preorderIdx; int numData; String docStr = ""; public SummaryNode() { } public SummaryNode(String docStr) { this.docStr = docStr; } ////////////////////////////////////////// // Methods for constructing the summary-node tree ////////////////////////////////////////// public void addData(Object obj) { if (obj instanceof Boolean) { this.addData((Boolean) obj); } else if (obj instanceof GenericArray) { this.addData((GenericArray) obj); } else if (obj instanceof Double) { this.addData((Double) obj); } else if (obj instanceof Float) { this.addData((Float) obj); } else if (obj instanceof GenericFixed) { this.addData((GenericFixed) obj); } else if (obj instanceof Integer) { this.addData((Integer) obj); } else if (obj instanceof Long) { this.addData((Long) obj); } else if (obj instanceof Map) { this.addData((Map) obj); } else if (obj instanceof ByteBuffer) { this.addData((ByteBuffer) obj); } else if (obj instanceof GenericRecord) { this.addData((GenericRecord) obj); } else if (obj instanceof Utf8) { this.addData((Utf8) obj); } else if (obj instanceof String) { this.addData((String) obj); } } // Overridden on per-subclass basis. public void addData(Boolean b) { }; public void addData(GenericArray g) { }; public void addData(Double d) { }; public void addData(Float f) { }; public void addData(Integer i) { }; public void addData(Long l) { }; public void addData(Map m) { }; public void addData(ByteBuffer bb) { }; public void addData(GenericRecord g) { }; public void addData(Utf8 u) { }; public void addData(String s) { }; /////////////////////////////////////////////// // Tree-manipulation and info methods /////////////////////////////////////////////// /** * How many nodes in this subtree? */ public int size() { int total = 0; for (SummaryNode child : children()) { total += child.size(); } return total + 1; } /** * Setters/getters */ SummaryNode getParent() { return parent; } void setParent(SummaryNode parent) { this.parent = parent; } public List<SummaryNode> children() { return new ArrayList<SummaryNode>(); } public int preorderCount() { return preorderIdx; } public SummaryNode parent() { return parent; } /** * Dealing with paths and node orderings */ public int computePreorder(int lastIdx) { lastIdx++; this.preorderIdx = lastIdx; for (SummaryNode child : children()) { lastIdx = child.computePreorder(lastIdx); child.setParent(this); } return lastIdx; } void preorder(List<SummaryNode> soFar) { soFar.add(this); for (SummaryNode child : children()) { child.preorder(soFar); } } public List<SummaryNode> preorder() { List<SummaryNode> l = new ArrayList<SummaryNode>(); preorder(l); return l; } public List<SummaryNode> pathToRoot() { List<SummaryNode> path = new ArrayList<SummaryNode>(); SummaryNode cur = this; while (cur != null) { path.add(cur); cur = cur.getParent(); } return path; } public List<SummaryNode> getLastNodeOnPath() { List<SummaryNode> path = new ArrayList<SummaryNode>(); SummaryNode cur = this; while (cur != null) { path.add(cur); cur = cur.getParent(); } return path; } /** * Useful in testing whether two fields are referring to the same thing. 
* Levenshtein edit distance is great, but we would like a value that ranges 0..1. * * To compute this, note that the LD is at least abs(len(s1)-len(s2)). It is also at * most max(len(s1), len(s2)). So we normalize LD by that range. */ double normalizedLevenshteinDistance(String s1, String s2) { int rawLD = computeLevenshteinDistance(s1, s2); int range = Math.abs(Math.max(s1.length(), s2.length()) - Math.abs(s1.length() - s2.length())); return (rawLD / (1.0 * range)); } /** * The classic string edit distance algorithm rides again. */ int computeLevenshteinDistance(String s1, String s2) { int s1Length = s1.length(); int s2Length = s2.length(); int s1pos; int s2pos; if (s1Length == 0) { return s2Length; } if (s2Length == 0) { return s1Length; } int d[][] = new int[s1Length + 1][]; for (int i = 0; i <= s1Length; i++) { d[i] = new int[s2Length + 1]; } for (int i = 0; i <= s1Length; i++) { d[i][0] = i; } for (int j = 0; j <= s2Length; j++) { d[0][j] = j; } for (int i = 1; i <= s1Length; i++) { char s1Char = s1.charAt(i - 1); for (int j = 1; j <= s2Length; j++) { char s2Char = s2.charAt(j - 1); int cost = 0; if (s1Char != s2Char) { cost = 1; } d[i][j] = Math.min(d[i - 1][j] + 1, Math.min(d[i][j - 1] + 1, d[i - 1][j - 1] + cost)); } } return d[s1Length][s2Length]; } /////////////////////////////////////////////// // Methods for string representation /////////////////////////////////////////////// /** * Helper method for rendering a string version of the data */ String prefixString(int prefix) { StringBuffer buf = new StringBuffer(); for (int i = 0; i < prefix; i++) { buf.append(" "); } return buf.toString(); } /** * Render a string version of the data */ public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + "\n"; } public abstract String getTypeDesc(); /** * Find the right node and obtain a description of it. */ public abstract String getDesc(boolean verbose); public String getDesc(int nodeid) { if (nodeid == preorderIdx) { return getDesc(false); } else { for (SummaryNode child : children()) { String desc = child.getDesc(nodeid); if (desc != null) { return desc; } } } return null; } public String getLabel(int nodeid) { if (nodeid == preorderIdx) { return getLabel(); } else { for (SummaryNode child : children()) { String label = child.getLabel(nodeid); if (label != null) { return label; } } } return null; } public String getTypeDesc(int nodeid) { if (nodeid == preorderIdx) { return getTypeDesc(); } else { for (SummaryNode child : children()) { String typedesc = child.getTypeDesc(nodeid); if (typedesc != null) { return typedesc; } } } return null; } public String getDocStr(int nodeid) { if (nodeid == preorderIdx) { return docStr; } else { for (SummaryNode child : children()) { String docstr = child.getDocStr(nodeid); if (docstr != null) { return docstr; } } } return null; } /** * Find the "label" for the current node. Since the top-level element in the * NodeSummary hierarchy is a record, we know that every element has a label. * The getLabel() function goes up the tree to the root, constructing the * dotted label sequence all the way. 
*/ public String getLabel() { if (parent != null) { return parent.getLabel("", this); } else { return "<root>"; } } public String getLabel(String labelSoFar, SummaryNode src) { if (parent != null) { return parent.getLabel(labelSoFar, this); } else { return labelSoFar; } } /////////////////////////////////////////////// // Cost functions for schema matching /////////////////////////////////////////////// /** * Figure out basic normalized string edit distance to * see if the schema labels match. If 'useAttributeLabels' * is set to false, then this distance is always zero. */ double computeSchemaLabelDistance(String l1, String l2) { if (!useAttributeLabels) { return 0; } else { if (l1.indexOf(".") >= 0) { l1 = l1.substring(l1.lastIndexOf(".") + 1); } if (l2.indexOf(".") >= 0) { l2 = l2.substring(l2.lastIndexOf(".") + 1); } return normalizedLevenshteinDistance(l1, l2); } } /** * The default non-type-specific way of performing schema matching is to * just compare the attribute labels. We can also examine data distributions, * but this is only possible in the subclasses' overriding transformCost() methods. */ public double transformCost(SummaryNode other) { if (this.getClass() == other.getClass()) { // Examine the field name for a schema-label distance return computeSchemaLabelDistance(this.getLabel(), other.getLabel()); } else { return MATCHCOST_TYPE_CLASH; } } public double deleteCost() { return MATCHCOST_DELETE; } public double createCost() { return MATCHCOST_CREATE; } /////////////////////////////////////////////// // Serialization/deserialization /////////////////////////////////////////////// public abstract void write(DataOutput out) throws IOException; public abstract void readFields(DataInput in) throws IOException; } /***************************************************** * Store statistical summary of observed arrays. Basically, store length information and # times seen. ****************************************************/ class ArraySummaryNode extends SummaryNode { int totalSize; SummaryNode eltSummary; public ArraySummaryNode() { } public ArraySummaryNode(SummaryNode eltSummary, String docStr) { super(docStr); this.eltSummary = eltSummary; } /** */ public void addData(GenericArray data) { numData++; totalSize += data.size(); for (Iterator it = data.iterator(); it.hasNext();) { eltSummary.addData(it.next()); } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + "\n" + eltSummary.dumpSummary(prefix + 2); } public String getTypeDesc() { return "ARRAY"; } public String getDesc(boolean verbose) { String desc = "ARRAY"; if (verbose) { desc += "(numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(ARRAY_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(totalSize); eltSummary.write(out); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.totalSize = in.readInt(); this.eltSummary = readAndCreate(in); } } /***************************************************** * Store statistical summary of observed boolean field. 
Store # times seen and distribution true vs false ****************************************************/ class BooleanSummaryNode extends SummaryNode { int numTrue; int numFalse; public BooleanSummaryNode() { } public BooleanSummaryNode(String docStr) { super(docStr); } public void addData(Boolean b) { numData++; if (b.booleanValue()) { numTrue++; } else { numFalse++; } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + "\n"; } public String getTypeDesc() { return "BOOLEAN"; } public String getDesc(boolean verbose) { String desc = "BOOLEAN"; if (verbose) { desc += "(numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(BOOLEAN_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(numTrue); out.writeInt(numFalse); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.numTrue = in.readInt(); this.numFalse = in.readInt(); } } /***************************************************** * Store statistical summary of observed Bytes field. Store # times seen and # bytes seen. ****************************************************/ class BytesSummaryNode extends SummaryNode { int totalSize = 0; public BytesSummaryNode() { } public BytesSummaryNode(String docStr) { super(docStr); } public void addData(ByteBuffer bb) { numData++; totalSize += bb.remaining(); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", totalSize: " + totalSize + "\n"; } public String getTypeDesc() { return "BYTES"; } public String getDesc(boolean verbose) { String desc = "BYTES"; if (verbose) { desc += "(numData: " + numData + ", totalSize: " + totalSize + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(BYTES_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(totalSize); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.totalSize = in.readInt(); } } /***************************************************** * Store statistical summary of observed Double field. 
Store # times seen and total value ****************************************************/ class DoubleSummaryNode extends SummaryNode { double total; public DoubleSummaryNode() { } public DoubleSummaryNode(String docStr) { super(docStr); } public void addData(Double d) { numData++; total += d.doubleValue(); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n"; } public String getTypeDesc() { return "DOUBLE"; } public String getDesc(boolean verbose) { String desc = "DOUBLE"; if (verbose) { desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(DOUBLE_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeDouble(total); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.total = in.readDouble(); } } /***************************************************** * Store statistical summary of observed Enumerated Type field. Store # times seen and statistics on how often * each enum-value is seen. ****************************************************/ class EnumSummaryNode extends SummaryNode { String name; Map<String, Integer> symbolCounts = new HashMap<String, Integer>(); public EnumSummaryNode() { } public EnumSummaryNode(String name, List<String> symbols, String docStr) { super(docStr); this.name = name; for (String symbol : symbols) { this.symbolCounts.put(symbol, 1); } } public void addData(String s) { this.symbolCounts.put(s, symbolCounts.get(s) + 1); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { StringBuffer buf = new StringBuffer(); buf.append(prefixString(prefix) + "numData: " + numData + " =>\n"); for (String symbol : symbolCounts.keySet()) { buf.append(prefixString(prefix + 2) + symbol + ": " + symbolCounts.get(symbol) + "\n"); } return buf.toString(); } public String getTypeDesc() { return "ENUM"; } public String getDesc(boolean verbose) { String desc = "ENUM"; if (verbose) { desc += "(numData: " + numData + ", numSymbols: " + symbolCounts.size() + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(ENUM_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(symbolCounts.size()); for (String symbol : symbolCounts.keySet()) { new Text(symbol).write(out); out.writeInt(symbolCounts.get(symbol)); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); symbolCounts = new HashMap<String, Integer>(); int numElts = in.readInt(); for (int i = 0; i < numElts; i++) { Text symbol = new Text(); symbol.readFields(in); Integer count = in.readInt(); symbolCounts.put(symbol.toString(), count); } } } /***************************************************** * Store statistical summary of observed GenericFixed field. Store # times seen and byte length information. Eventually, * store info on the byte content, too. 
****************************************************/ class FixedSummaryNode extends SummaryNode { String name; int size; int total; public FixedSummaryNode() { } public FixedSummaryNode(String name, int size, String docStr) { super(docStr); this.name = name; this.size = size; this.total = 0; } public void addData(GenericFixed data) { byte d[] = data.bytes(); total += d.length; numData++; } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "size: " + size + ", total: " + total + ", numData: " + numData; } public String getTypeDesc() { return "FIXED"; } public String getDesc(boolean verbose) { String desc = "FIXED"; if (verbose) { desc += "(numData: " + numData + ", size: " + size + ", total: " + total + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(FIXED_NODE); new Text(name).write(out); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(size); out.writeInt(total); } public void readFields(DataInput in) throws IOException { this.name = Text.readString(in); this.docStr = UTF8.readString(in); this.size = in.readInt(); this.total = in.readInt(); } } /***************************************************** * Store statistical summary of observed Float field. Store # times seen and total value ****************************************************/ class FloatSummaryNode extends SummaryNode { float total; public FloatSummaryNode() { } public FloatSummaryNode(String docStr) { super(docStr); } public void addData(Float f) { numData++; total += f.floatValue(); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n"; } public String getTypeDesc() { return "FLOAT"; } public String getDesc(boolean verbose) { String desc = "FLOAT"; if (verbose) { desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(FLOAT_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeFloat(total); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.total = in.readFloat(); } } /***************************************************** * Store statistical summary of observed Integer field. 
* Store total value, num data elements, and a sample of actual data elts ****************************************************/ class IntegerSummaryNode extends SummaryNode { int total; List<Integer> samples = new ArrayList<Integer>(); public IntegerSummaryNode() { } public IntegerSummaryNode(String docStr) { super(docStr); } public void addData(Integer i) { numData++; total += i.intValue(); if (samples.size() < MAX_SUMMARY_SAMPLES) { samples.add(i); } } /////////////////////////////////////////////// // Cost functions for schema matching /////////////////////////////////////////////// public double transformCost(SummaryNode other) { if (this.getClass() == other.getClass()) { double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel()); double klDivergence = computeSampleKLDivergence((IntegerSummaryNode) other); return schemaLabelDistance + klDivergence; } else { return MATCHCOST_TYPE_CLASH; } } /** * This computes the Kullback-Leibler divergence between two int distributions. It * measures how much the two integer distributions differ. Useful for testing whether * they should be matched. * * Assumes the two distributions are gaussians. */ public double computeSampleKLDivergence(IntegerSummaryNode other) { double mean1 = total / (1.0 * numData); double mean2 = other.total / (1.0 * other.numData); double stddev1 = computeStddev(); double stddev2 = other.computeStddev(); double variance1 = Math.pow(stddev1, 2); double variance2 = Math.pow(stddev2, 2); return Math.log(stddev2 / stddev1) + ((variance1 + Math.pow(mean1 - mean2, 2)) / (2 * Math.pow(variance2, 2))) - 0.5; } /** * Compute the standard deviation of the distribution of integers in this summary node. * Note that if the sample is smaller than the genuine data, we take the * "sample standard deviation", not the true stddev. */ public double computeStddev() { double mean = total / (1.0 * numData); double total = 0; for (Integer sample : samples) { total += Math.pow(sample.intValue() - mean, 2); } double normalizer = 1 / (1.0 * numData); if (samples.size() < numData) { // This here's what makes the "sample std deviation" in case we're not // looking at the full dataset. normalizer = 1 / (1.0 * (numData - 1)); } double variance = normalizer * total; return Math.sqrt(variance); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n"; } public String getTypeDesc() { return "INT"; } public String getDesc(boolean verbose) { String desc = "INT"; if (verbose) { desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(INT_NODE); out.writeInt(numData); UTF8.writeString(out, (docStr == null) ? "" : docStr); out.writeInt(total); out.writeInt(samples.size()); for (Integer sample : samples) { out.writeInt(sample.intValue()); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.total = in.readInt(); this.samples.clear(); int numSamples = in.readInt(); for (int i = 0; i < numSamples; i++) { this.samples.add(in.readInt()); } } } /***************************************************** * Store statistical summary of observed Long field. 
Store # times seen and total value ****************************************************/ class LongSummaryNode extends SummaryNode { long total; public LongSummaryNode() { } public LongSummaryNode(String docStr) { super(docStr); } public void addData(Long l) { numData++; total += l.longValue(); } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n"; } public String getTypeDesc() { return "LONG"; } public String getDesc(boolean verbose) { String desc = "LONG"; if (verbose) { desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(LONG_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeLong(total); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.total = in.readLong(); } } /***************************************************** * Store statistical summary of observed Map field. Store # times seen and track data for each labeled key-pair. ****************************************************/ class MapSummaryNode extends SummaryNode { Schema modelS; HashMap<Utf8, SummaryNode> stats = new HashMap<Utf8, SummaryNode>(); public MapSummaryNode() { } public MapSummaryNode(Schema modelS, String docStr) { super(docStr); this.modelS = modelS; } public void addData(Map m) { numData++; Iterator it = m.keySet().iterator(); while (it.hasNext()) { Utf8 key = (Utf8) it.next(); SummaryNode s = stats.get(key); if (s == null) { s = buildStructure(modelS, modelS.getDoc()); stats.put(key, s); } s.addData(m.get(key)); } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { StringBuffer buf = new StringBuffer(); buf.append(prefixString(prefix) + "+------------------------------------------+\n"); buf.append(prefixString(prefix) + "numData: " + numData + "\n"); for (Utf8 key : stats.keySet()) { SummaryNode s = stats.get(key); buf.append(prefixString(prefix) + key + " =>\n" + s.dumpSummary(prefix + 2)); } buf.append(prefixString(prefix) + "+------------------------------------------+\n"); return buf.toString(); } public String getTypeDesc() { return "MAP"; } public String getDesc(boolean verbose) { String desc = "MAP"; if (verbose) { desc += "(numData: " + numData + ", numSymbols: " + stats.size() + ")"; } return getLabel() + ": " + desc; } public String getLabel(String labelSoFar, SummaryNode src) { for (Utf8 fname : stats.keySet()) { SummaryNode candidate = stats.get(fname); if (src == candidate) { if (parent != null) { labelSoFar = (labelSoFar.length() > 0) ? fname.toString() + "." + labelSoFar : fname.toString(); return parent.getLabel(labelSoFar, this); } } } return labelSoFar; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(MAP_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? 
"" : docStr); out.writeInt(stats.size()); for (Utf8 key : stats.keySet()) { new Text(key.toString()).write(out); stats.get(key).write(out); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); int numElts = in.readInt(); for (int i = 0; i < numElts; i++) { Text key = new Text(); key.readFields(in); SummaryNode sn = readAndCreate(in); stats.put(new Utf8(key.toString()), sn); } } } /***************************************************** * Store statistical summary of observed Null field. Just store # times seen. ****************************************************/ class NullSummaryNode extends SummaryNode { public NullSummaryNode() { } public NullSummaryNode(String docStr) { super(docStr); } public void addData() { numData++; } public String getDesc(boolean verbose) { String desc = "NULL"; if (verbose) { desc += "(numData: " + numData + ")"; } return getLabel() + ": " + desc; } public String getTypeDesc() { return "NULL"; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(NULL_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); } } /***************************************************** * Store statistical summary of observed Record field. Store # times seen and then data about sub-elements. ****************************************************/ class RecordSummaryNode extends SummaryNode { String name; Map<String, SummaryNode> recordSummary = new HashMap<String, SummaryNode>(); public RecordSummaryNode() { } public RecordSummaryNode(String name, String docStr) { super(docStr); this.name = name; } public List<SummaryNode> children() { List<SummaryNode> l = new ArrayList<SummaryNode>(); for (String key : recordSummary.keySet()) { l.add(recordSummary.get(key)); } return l; } public void addField(String fname, SummaryNode fn) { recordSummary.put(fname, fn); } public void addData(GenericRecord data) { numData++; for (String fname : recordSummary.keySet()) { recordSummary.get(fname).addData(data.get(fname)); } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { StringBuffer buf = new StringBuffer(); buf.append(prefixString(prefix) + "+------------------------------------------+\n"); buf.append(prefixString(prefix) + "numData: " + numData + "\n"); for (String fname : recordSummary.keySet()) { buf.append( prefixString(prefix) + fname + " =>\n" + recordSummary.get(fname).dumpSummary(prefix + 2)); } buf.append(prefixString(prefix) + "+------------------------------------------+\n"); return buf.toString(); } public String getTypeDesc() { return "RECORD"; } public String getDesc(boolean verbose) { String desc = "RECORD"; if (verbose) { desc += "(numData: " + numData + ", fields: " + recordSummary.size() + ")"; } return getLabel() + ": " + desc; } public String getLabel(String labelSoFar, SummaryNode src) { for (String fname : recordSummary.keySet()) { SummaryNode candidate = recordSummary.get(fname); if (src == candidate) { labelSoFar = (labelSoFar.length() > 0) ? fname + "." + labelSoFar : fname; if (parent != null) { return parent.getLabel(labelSoFar, this); } else { return "<root>" + "." + labelSoFar; } } } return "<root>" + "." 
+ labelSoFar; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(RECORD_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? "" : docStr); out.writeInt(recordSummary.size()); for (String fname : recordSummary.keySet()) { new Text(fname).write(out); recordSummary.get(fname).write(out); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); int numRecs = in.readInt(); for (int i = 0; i < numRecs; i++) { Text fname = new Text(); fname.readFields(in); SummaryNode sn = readAndCreate(in); recordSummary.put(fname.toString(), sn); } } } /***************************************************** * Store statistical summary of observed String field. Store # times seen and total length of the strings (for now). * Eventually, store info on the String content, too. ****************************************************/ class StringSummaryNode extends SummaryNode { int totalLength; Set<Utf8> observedStrings = new TreeSet<Utf8>(); public StringSummaryNode() { } public StringSummaryNode(String docStr) { super(docStr); } public void addData(Utf8 s) { numData++; totalLength += s.getLength(); observedStrings.add(s); } /////////////////////////////////////////////// // Cost functions for schema matching /////////////////////////////////////////////// public double transformCost(SummaryNode other) { if (this.getClass() == other.getClass()) { double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel()); double jaccardSimilarity = computeJaccardSimilarity((StringSummaryNode) other); double jaccardDistance = 1 - jaccardSimilarity; return schemaLabelDistance + jaccardDistance; } else { return MATCHCOST_TYPE_CLASH; } } /** * This is a useful score for determining whether two sets of objects are similar */ public double computeJaccardSimilarity(StringSummaryNode other) { Set<Utf8> larger = (this.numData >= other.numData ? this.observedStrings : other.observedStrings); Set<Utf8> smaller = (this.numData < other.numData ? this.observedStrings : other.observedStrings); int unionSize = larger.size(); if (larger.contains(new Utf8(""))) { unionSize -= 1; } int intersectionSize = 0; for (Utf8 smallElt : smaller) { if (smallElt.length() == 0) { continue; } if (larger.contains(smallElt)) { intersectionSize++; } else { unionSize++; } } if (unionSize == 0) { return 0; } else { return intersectionSize / (1.0 * unionSize); } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { return prefixString(prefix) + "numData: " + numData + ", avg-len: " + (totalLength / (1.0 * numData)) + "\n"; } public String getTypeDesc() { return "STRING"; } public String getDesc(boolean verbose) { String desc = "STRING"; if (verbose) { desc += "(numData: " + numData + ", avglen: " + (totalLength / (1.0 * numData)) + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(STRING_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? 
"" : docStr); out.writeInt(totalLength); out.writeInt(observedStrings.size()); for (Utf8 s : observedStrings) { UTF8.writeString(out, s.toString()); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); this.totalLength = in.readInt(); observedStrings.clear(); int numInts = in.readInt(); for (int i = 0; i < numInts; i++) { observedStrings.add(new Utf8(UTF8.readString(in))); } } } /***************************************************** * Store statistical summary of observed Union field. Actually, a Union is not observed directly - we just know * it's a union from the schema. Store # times seen, data on the particular type observed, and statistics on how * often each subtype is seen. ****************************************************/ class UnionSummaryNode extends SummaryNode { Map<Schema.Type, SummaryNode> unionTypes = new HashMap<Schema.Type, SummaryNode>(); Map<Schema.Type, Integer> unionTypeCounts = new HashMap<Schema.Type, Integer>(); public UnionSummaryNode() { } public UnionSummaryNode(String docStr) { super(docStr); } public void addType(Schema.Type t, SummaryNode sn) { if (unionTypes.get(t) == null) { unionTypes.put(t, sn); unionTypeCounts.put(t, 0); } } /** * We need to dispatch the object to the right element stored in 'unionTypes' */ public void addData(Object obj) { Schema.Type t = Schema.Type.ARRAY; if (obj instanceof GenericArray) { t = Schema.Type.ARRAY; } else if (obj instanceof Boolean) { t = Schema.Type.BOOLEAN; } else if (obj instanceof ByteBuffer) { t = Schema.Type.BYTES; } else if (obj instanceof Double) { t = Schema.Type.DOUBLE; } else if (obj instanceof String) { t = Schema.Type.ENUM; } else if (obj instanceof GenericFixed) { t = Schema.Type.FIXED; } else if (obj instanceof Float) { t = Schema.Type.FLOAT; } else if (obj instanceof Integer) { t = Schema.Type.INT; } else if (obj instanceof Long) { t = Schema.Type.LONG; } else if (obj instanceof Map) { t = Schema.Type.MAP; } else if (obj instanceof GenericRecord) { t = Schema.Type.RECORD; } else if (obj instanceof Utf8) { t = Schema.Type.STRING; } unionTypes.get(t).addData(obj); Integer c = unionTypeCounts.get(t); if (c == null) { unionTypeCounts.put(t, 1); } else { unionTypeCounts.put(t, c.intValue() + 1); } } ///////////////////////////// // String representation ///////////////////////////// public String dumpSummary(int prefix) { StringBuffer buf = new StringBuffer(); for (Schema.Type t : unionTypes.keySet()) { buf.append(prefixString(prefix) + "unionType: " + t + " =>\n"); buf.append(unionTypes.get(t).dumpSummary(prefix + 2)); } return buf.toString(); } public String getTypeDesc() { return "UNION"; } public String getDesc(boolean verbose) { String desc = "UNION"; if (verbose) { desc += "(numData: " + numData + ", numtypes: " + unionTypes.size() + ")"; } return getLabel() + ": " + desc; } ///////////////////////////// // Serialize/deserialize ///////////////////////////// public void write(DataOutput out) throws IOException { out.writeShort(UNION_NODE); out.writeInt(numData); UTF8.writeString(out, docStr == null ? 
"" : docStr); out.writeInt(unionTypes.size()); for (Schema.Type t : unionTypes.keySet()) { new Text(t.toString()).write(out); out.writeInt(unionTypeCounts.get(t)); unionTypes.get(t).write(out); } } public void readFields(DataInput in) throws IOException { this.numData = in.readInt(); this.docStr = UTF8.readString(in); int numTypes = in.readInt(); for (int i = 0; i < numTypes; i++) { Text tLabel = new Text(); tLabel.readFields(in); Schema.Type t = Schema.Type.valueOf(tLabel.toString()); int typeCount = in.readInt(); SummaryNode sn = readAndCreate(in); unionTypes.put(t, sn); unionTypeCounts.put(t, typeCount); } } } /*************************************** * Op is used to track mapping results ***************************************/ class PreviousChoice extends SchemaMappingOp { Hashtable<String, List<SchemaMappingOp>> h; String label; public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int i, int j) { this.h = h; this.label = "" + i + "-" + j; } public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int p1, int p2, int p3, int p4, int p5, int p6) { this.h = h; this.label = "" + p1 + "-" + p2 + "-" + p3 + "-" + p4 + "-" + p5 + "-" + p6; } public List<SchemaMappingOp> getOps() { List<SchemaMappingOp> ops = h.get(label); if (ops == null) { ops = new ArrayList<SchemaMappingOp>(); } return ops; } public String toString() { return "Previous! " + label; } } ///////////////////////////////////////////////// // Members ///////////////////////////////////////////////// SummaryNode root = null; boolean useAttributeLabels = true; String datasetLabel = ""; ///////////////////////////////////////////////// // Constructors, initializers ///////////////////////////////////////////////// public SchemaStatisticalSummary() throws IOException { } public SchemaStatisticalSummary(String datasetLabel) throws IOException { this.datasetLabel = datasetLabel; } public void setUseAttributeLabels(boolean useAttributeLabels) { this.useAttributeLabels = useAttributeLabels; } /** * Create the statistical summary object from data. */ public Schema createSummaryFromData(File f) throws IOException { DataFileReader in = new DataFileReader(f, new GenericDatumReader()); try { Schema s = in.getSchema(); // // There has to be at least one data element for us to infer anything meaningful // Iterator it = in.iterator(); if (!it.hasNext()) { throw new IOException("No contents"); } // // We can only infer schemas from top-level records, not Fixeds or Arrays. // Object firstRecord = it.next(); if (firstRecord instanceof GenericFixed || firstRecord instanceof GenericArray) { throw new IOException("Not a top-level record"); } // We assume the passed-in top-level Schema always represents a Record. if (s.getType() != Schema.Type.RECORD) { throw new IOException("Passed-in top-level Schema instance must be of type Schema.Type.RECORD"); } this.root = buildStructure(s, "ROOT"); // // Iterate through all records and collect statistics on each Schema field. // List<Schema.Field> fields = s.getFields(); GenericRecord cur = (GenericRecord) firstRecord; int counter = 0; do { this.root.addData(cur); counter++; if (it.hasNext()) { cur = (GenericRecord) it.next(); } else { cur = null; } } while (cur != null); this.root.computePreorder(-1); return s; } finally { in.close(); } } /** * This function reads in data and instantiates the SummaryNode hierarchy. 
   */
  public SummaryNode readAndCreate(DataInput in) throws IOException {
    short nodeType = in.readShort();
    SummaryNode sn = null;
    switch (nodeType) {
    case ARRAY_NODE: { sn = new ArraySummaryNode(); break; }
    case BOOLEAN_NODE: { sn = new BooleanSummaryNode(); break; }
    case BYTES_NODE: { sn = new BytesSummaryNode(); break; }
    case DOUBLE_NODE: { sn = new DoubleSummaryNode(); break; }
    case ENUM_NODE: { sn = new EnumSummaryNode(); break; }
    case FIXED_NODE: { sn = new FixedSummaryNode(); break; }
    case FLOAT_NODE: { sn = new FloatSummaryNode(); break; }
    case INT_NODE: { sn = new IntegerSummaryNode(); break; }
    case LONG_NODE: { sn = new LongSummaryNode(); break; }
    case MAP_NODE: { sn = new MapSummaryNode(); break; }
    case NULL_NODE: { sn = new NullSummaryNode(); break; }
    case RECORD_NODE: { sn = new RecordSummaryNode(); break; }
    case STRING_NODE: { sn = new StringSummaryNode(); break; }
    case UNION_NODE: { sn = new UnionSummaryNode(); break; }
    default: throw new IOException("Unknown node type: " + nodeType);
    }
    sn.readFields(in);
    return sn;
  }

  /**
   * Build a Summary structure out of the given schema. Helper method.
   */
  SummaryNode buildStructure(Schema s, String docStr) {
    Schema.Type stype = s.getType();
    if (stype == Schema.Type.ARRAY) {
      return new ArraySummaryNode(buildStructure(s.getElementType(), s.getDoc()), docStr);
    } else if (stype == Schema.Type.BOOLEAN) {
      return new BooleanSummaryNode(docStr);
    } else if (stype == Schema.Type.BYTES) {
      return new BytesSummaryNode(docStr);
    } else if (stype == Schema.Type.DOUBLE) {
      return new DoubleSummaryNode(docStr);
    } else if (stype == Schema.Type.ENUM) {
      return new EnumSummaryNode(s.getFullName(), s.getEnumSymbols(), docStr);
    } else if (stype == Schema.Type.FIXED) {
      return new FixedSummaryNode(s.getFullName(), s.getFixedSize(), docStr);
    } else if (stype == Schema.Type.FLOAT) {
      return new FloatSummaryNode(docStr);
    } else if (stype == Schema.Type.INT) {
      return new IntegerSummaryNode(docStr);
    } else if (stype == Schema.Type.LONG) {
      return new LongSummaryNode(docStr);
    } else if (stype == Schema.Type.MAP) {
      return new MapSummaryNode(s.getValueType(), docStr);
    } else if (stype == Schema.Type.NULL) {
      return new NullSummaryNode(docStr);
    } else if (stype == Schema.Type.RECORD) {
      RecordSummaryNode rsn = new RecordSummaryNode(s.getFullName(), docStr);
      for (Field f : s.getFields()) {
        rsn.addField(f.name(), buildStructure(f.schema(), f.doc()));
      }
      return rsn;
    } else if (stype == Schema.Type.STRING) {
      return new StringSummaryNode(docStr);
    } else if (stype == Schema.Type.UNION) {
      UnionSummaryNode usn = new UnionSummaryNode(docStr);
      for (Schema subschema : s.getTypes()) {
        usn.addType(subschema.getType(), buildStructure(subschema, subschema.getDoc()));
      }
      return usn;
    }
    return null;
  }

  /////////////////////////////////////////////////////////
  // Schema distance computation
  /////////////////////////////////////////////////////////
  /**
   * Get the minimum mapping cost from a schema of size k to one of size m.
   * This helps us avoid mapping computations that couldn't possibly produce
   * a low-distance mapping.
   */
  public static double getMinimumMappingCost(int k, int m) {
    return Math.abs(k - m) * Math.min(MATCHCOST_CREATE, MATCHCOST_DELETE);
  }

  /**
   * Find the best mapping between the current schema summary and the one provided
   * by the parameter.
*/ public SchemaMapping getBestMapping(SchemaStatisticalSummary other) { SummaryNode t1 = root; SummaryNode t2 = other.root; TreeMap<Integer, SummaryNode> t1NonLeafs = new TreeMap<Integer, SummaryNode>(); TreeMap<Integer, SummaryNode> t1Leafs = new TreeMap<Integer, SummaryNode>(); TreeMap<Integer, SummaryNode> t2NonLeafs = new TreeMap<Integer, SummaryNode>(); TreeMap<Integer, SummaryNode> t2Leafs = new TreeMap<Integer, SummaryNode>(); // // Find all the non-leaf nodes // for (SummaryNode iNode : t1.preorder()) { if (iNode.children().size() > 0) { t1NonLeafs.put(iNode.preorderCount(), iNode); } else { t1Leafs.put(iNode.preorderCount(), iNode); } } for (SummaryNode jNode : t2.preorder()) { if (jNode.children().size() > 0) { t2NonLeafs.put(jNode.preorderCount(), jNode); } else { t2Leafs.put(jNode.preorderCount(), jNode); } } // // Start by computing all the potential 1:1 leaf-level match costs. // List<DistancePair[]> allCosts = new ArrayList<DistancePair[]>(); Set<DistancePair> allKnownCostPairs = new TreeSet<DistancePair>(); for (SummaryNode iNode : t1.preorder()) { int iIdx = iNode.preorderCount(); DistancePair fromI[] = null; if (t1NonLeafs.get(iIdx) == null) { List<DistancePair> costs = new ArrayList<DistancePair>(); for (SummaryNode jNode : t2.preorder()) { int jIdx = jNode.preorderCount(); if (t2NonLeafs.get(jIdx) == null) { DistancePair dp = new DistancePair(iNode.transformCost(jNode), iNode, jNode); costs.add(dp); allKnownCostPairs.add(dp); } } costs.add(new DistancePair(iNode.deleteCost(), iNode, null)); fromI = costs.toArray(new DistancePair[costs.size()]); Arrays.sort(fromI); } allCosts.add(fromI); } // // Now pass those costs to the mapping algorithm. // Select which mapping algorithm we want to use. For now, it's 'greedy'. // return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs); /** boolean performTraditionalMapping = false; if (performTraditionalMapping) { return findTraditionalMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allCosts); } else { return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs); } **/ } /** * findTraditionalMapping() tries the best k permutations of matches and returns the best one. * The number of permutations can grow rapidly as the sizes of the two schemas grow, so this method * can be very time-consuming. */ /** SchemaMapping findTraditionalMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, List<DistancePair[]> allCosts) { // // Figure out how far down into each attr's match-list we can go while only evaluating the // estimated top-k-scoring schema matches. (Estimated by combining independent 1:1 match scores; // no enforcement of the pigeonhole constraint.) 
// int MAX_CANDIDATES = 100000; int numToPeek[] = new int[allCosts.size()]; for (int i = 0; i < numToPeek.length; i++) { if (allCosts.get(i) == null) { numToPeek[i] = 0; } else { numToPeek[i] = 1; } } int numCandidates = 1; System.err.println("Num elts: " + numToPeek.length); do { int peekIndex = -1; double cheapestPeek = Double.MAX_VALUE; for (int i = 0; i < numToPeek.length; i++) { if (allCosts.get(i) != null && numToPeek[i] < allCosts.get(i).length) { double candidatePeekValue = allCosts.get(i)[numToPeek[i]].getCost(); if (candidatePeekValue < cheapestPeek) { cheapestPeek = candidatePeekValue; peekIndex = i; } } } if (peekIndex >= 0) { numToPeek[peekIndex]++; } else { break; } numCandidates = 1; for (int i = 0; i < numToPeek.length; i++) { if (numToPeek[i] >= 1) { numCandidates *= numToPeek[i]; } } } while (numCandidates < MAX_CANDIDATES); System.err.println("All cost size: " + allCosts.size() + ", number of candidates examined: " + numCandidates); System.err.println(); numCandidates = Math.max(MAX_CANDIDATES, numCandidates); // // Now the numToPeek vector tells us how many steps down to go in each attr's // ranked list of preferred matches. The product of all of these determines the # of candidates. // int curPeek[] = new int[numToPeek.length]; for (int i = 0; i < curPeek.length; i++) { if (numToPeek[i] == 0) { curPeek[i] = 0; } else { curPeek[i] = 1; } } // // Now go through all the possible configurations of top-k mappings. // // We optimize for the common case in which we have two near-flat hierarchies // DistancePair bestMatchConfig[] = new DistancePair[curPeek.length]; DistancePair matchConfig[] = new DistancePair[curPeek.length]; double bestCost = Double.MAX_VALUE; boolean peeksRemain = numCandidates > 0; long startTime = System.currentTimeMillis(); int numIters = 0; while (peeksRemain) { numIters++; //////////////////////////////////////// // Evaluate this configuration ("peek") //////////////////////////////////////// // // 1. Build a proper 'match configuration' out of the leaf-level 1:1 'curPeek'. // That means we generate record-level correspondences when justified by full // child-correspondences // for (SummaryNode iNode: t1.preorder()) { int iNodeIdx = iNode.preorderCount(); matchConfig[iNodeIdx] = null; DistancePair[] allINodeMatches = allCosts.get(iNodeIdx); if (allINodeMatches != null) { matchConfig[iNodeIdx] = allINodeMatches[curPeek[iNodeIdx]-1]; } } // // 2. Modify the current matchConfig s.t. if ALL of a non-leaf's children match ALL of // the children of a non-leaf, then the two non-leafs also match. // Because of the potential record hierarchy, this procedure needs to be repeated until // there is an iteration in which no new matches are found (or until the roots are matched). 
// for (Map.Entry<Integer, SummaryNode> elt: t1NonLeafs.entrySet()) { SummaryNode iNode = elt.getValue(); if (matchConfig[iNode.preorderCount()] != null) { continue; } // For each child of this t1 internal node, place the matching node's parent into a set TreeMap<Integer, SummaryNode> observedMatchParents = new TreeMap<Integer, SummaryNode>(); for (SummaryNode iChild: iNode.children()) { int iChildIdx = iChild.preorderCount(); DistancePair jMatch = matchConfig[iChildIdx]; if (jMatch != null) { if (jMatch.getNode() == null) { observedMatchParents.put(-1, iChild); } else { SummaryNode jMatchParent = jMatch.getNode().getParent(); observedMatchParents.put(jMatchParent.preorderCount(), jMatchParent); } } } // If the parent-set has just one element, then internal node iNode // should be matched to the singleton elt in the parent-set. if (observedMatchParents.size() == 1) { int matchIdx = observedMatchParents.firstKey().intValue(); if (matchIdx >= 0) { SummaryNode jMatchParent = observedMatchParents.get(matchIdx); matchConfig[iNode.preorderCount()] = new DistancePair(0, iNode, jMatchParent); } } } // // 3. Compute the total match costs. // a. The first component is the TRANSFORM costs of the discovered 1:1 leaf matches. // (Valid matches among non-leafs are free.) // double total = 0; for (int iNodeIdx = 0; iNodeIdx < matchConfig.length; iNodeIdx++) { if (matchConfig[iNodeIdx] != null) { // Get the transform cost total += matchConfig[iNodeIdx].getCost(); } } // // 3b. Compute DELETE penalties. These are elts in t1 that are NOT MATCHED to anything // in t2. Non-leaf nodes that are unmatched DO incur penalties. // // While we're there, compute the set of items in t2 that DO have a matched elt. // int numDuplicates = 0; HashSet<Integer> observedT2Nodes = new HashSet<Integer>(); for (SummaryNode iNode: t1.preorder()) { int iNodeIdx = iNode.preorderCount(); if (matchConfig[iNodeIdx] == null || matchConfig[iNodeIdx].getNode() == null) { total += iNode.deleteCost(); } else { int jIdx = matchConfig[iNodeIdx].getNode().preorderCount(); if (observedT2Nodes.contains(jIdx)) { numDuplicates++; } else { observedT2Nodes.add(jIdx); } } } // // 3c. Compute CREATE penalties. These count for any items in the target schema t2 // that have gone unmapped. // for (SummaryNode jNode: t2.preorder()) { int jIdx = jNode.preorderCount(); if (! observedT2Nodes.contains(jIdx)) { total += jNode.createCost(); } } // // 4. Impose a penalty for duplicate mappings in t2. // // // Is it the best mapping so far? // if (total < bestCost) { bestCost = total; System.arraycopy(matchConfig, 0, bestMatchConfig, 0, bestMatchConfig.length); } ///////////////////////////////////////////// // Find the next configuration to evaluate (leaf-level "peek"). // We try to do a 'breadth-first search' rather than go deep on // a single peeklist. This makes it easier to find the best match sooner, // and thus abort the process early. ///////////////////////////////////////////// peeksRemain = false; int minSeen = Integer.MAX_VALUE; int minIndex = -1; for (int i = 0; i < curPeek.length; i++) { if (curPeek[i] == 0) { continue; } else { if (curPeek[i] < numToPeek[i]) { curPeek[i]++; for (int j = i-1; j >= 0; j--) { if (curPeek[j] > 0) { curPeek[j] = 1; } } peeksRemain = true; break; } } } } long endTime = System.currentTimeMillis(); System.err.println("Evaluting peeks: " + ((endTime - startTime) / 1000.0) + " over " + numIters + " iterations."); // // ALMOST DONE: We have the best match. 
Now we translate it into a series of SchemaMappingOps // List<SchemaMappingOp> bestOps = new ArrayList<SchemaMappingOp>(); HashSet<Integer> bestMapTargets = new HashSet<Integer>(); for (int i = 0; i < bestMatchConfig.length; i++) { if (bestMatchConfig[i] != null && bestMatchConfig[i].getNode() != null) { int dstIdx = bestMatchConfig[i].getNode().preorderCount(); bestOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, i, other, dstIdx)); bestMapTargets.add(dstIdx); } else { bestOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, i)); } } for (SummaryNode jNode: t2.preorder()) { int jIdx = jNode.preorderCount(); if (jNode.children().size() == 0 && ! bestMapTargets.contains(jIdx)) { bestOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jIdx)); } } // // All done! // return new SchemaMapping(this, other, bestCost, bestOps); } **/ /** * Greedy Mapping is sloppy, but very fast. It repeatedly accepts the best-looking pairwise * match, until there is nothing left to match. Seems to work well so far, but needs to be * tested more. */ SchemaMapping findGreedyMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, Set<DistancePair> allKnownCostPairs) { int totalSrcs = t1Leafs.size(); int totalDsts = t2Leafs.size(); Set<Integer> observedSrcs = new TreeSet<Integer>(); Set<Integer> observedDsts = new TreeSet<Integer>(); List<DistancePair> matching = new ArrayList<DistancePair>(); List<SchemaMappingOp> outputOps = new ArrayList<SchemaMappingOp>(); double totalCost = 0; // // Find all the leaf-level matches // Map<Integer, SummaryNode> transformMap = new TreeMap<Integer, SummaryNode>(); for (DistancePair dp : allKnownCostPairs) { int srcId = dp.getSrc().preorderCount(); int dstId = dp.getNode().preorderCount(); if ((!observedSrcs.contains(srcId)) && (!observedDsts.contains(dstId))) { matching.add(dp); observedSrcs.add(srcId); observedDsts.add(dstId); outputOps.add( new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, srcId, other, dstId, dp.getCost())); transformMap.put(srcId, dp.getNode()); totalCost += dp.getCost(); if (matching.size() >= Math.min(totalSrcs, totalDsts)) { break; } } } // // Look for internal nodes that should be matched. If ALL of an internal node's children // match ALL of another internal node's children, then the two internal nodes also match. // for (Map.Entry<Integer, SummaryNode> elt : t1NonLeafs.entrySet()) { SummaryNode iNode = elt.getValue(); SortedSet<Integer> knownDstParents = new TreeSet<Integer>(); for (SummaryNode iChild : iNode.children()) { int iChildIdx = iChild.preorderCount(); SummaryNode dstNode = transformMap.get(iChildIdx); if (dstNode != null) { knownDstParents.add(dstNode.getParent().preorderCount()); } } // There's just one parent of the destination nodes, so we have found an internal node match. if (knownDstParents.size() == 1) { Integer dstIdx = knownDstParents.first(); SummaryNode dstNode = t2NonLeafs.get(dstIdx); outputOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, iNode.preorderCount(), other, dstIdx, 0)); observedSrcs.add(iNode.preorderCount()); observedDsts.add(dstIdx); } } // // If a node is in the source, but not the dest, then we need to DELETE it. // Compute the DELETE costs here. 
// for (SummaryNode iNode : t1.preorder()) { int iNodeIdx = iNode.preorderCount(); if (!observedSrcs.contains(iNodeIdx)) { totalCost += iNode.deleteCost(); outputOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, iNodeIdx, iNode.deleteCost())); } } // // If a node is in the dest, but not the source, then we need to CREATE it. // Compute the CREATE costs here. // for (SummaryNode jNode : t2.preorder()) { int jNodeIdx = jNode.preorderCount(); if (!observedDsts.contains(jNodeIdx)) { totalCost += jNode.createCost(); outputOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jNodeIdx, jNode.createCost())); } } return new SchemaMapping(this, other, totalCost, outputOps); } class DistancePair implements Comparable { double cost; SummaryNode src; SummaryNode target; public DistancePair(double cost, SummaryNode src, SummaryNode target) { this.cost = cost; this.src = src; this.target = target; } public int compareTo(Object o) { DistancePair other = (DistancePair) o; if (cost < other.cost) { return -1; } else if (cost > other.cost) { return 1; } else { int cmp = src.preorderCount() - other.src.preorderCount(); if (cmp == 0) { cmp = target.preorderCount() - other.target.preorderCount(); } return cmp; } } public double getCost() { return cost; } public SummaryNode getSrc() { return src; } public SummaryNode getNode() { return target; } public int getIdx() { return target.preorderCount(); } public String toString() { if (target != null) { return "" + target.getDesc(false) + " cost=" + cost; } else { return " DELETE cost=" + cost; } } } //////////////////////////////////////////////// // String representation of the overall summary object //////////////////////////////////////////////// public String getDatasetLabel() { return datasetLabel; } public String dumpSummary() { return this.root.dumpSummary(0); } public String getDesc(int nodeid) { return root.getDesc(nodeid); } public String getLabel(int nodeid) { return root.getLabel(nodeid); } public String getTypeDesc(int nodeid) { return root.getTypeDesc(nodeid); } public String getDocStr(int nodeid) { return root.getDocStr(nodeid); } //////////////////////////////////////////////// // Serialization/deserialization //////////////////////////////////////////////// public void write(DataOutput out) throws IOException { out.write(MAGIC); out.write(VERSION); root.write(out); UTF8.writeString(out, datasetLabel); } public void readFields(DataInput in) throws IOException { byte magic = in.readByte(); byte version = in.readByte(); this.root = readAndCreate(in); this.root.computePreorder(-1); this.datasetLabel = UTF8.readString(in); } }
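A second hedged sketch shows the Writable-style persistence implemented above: write(DataOutput) emits a magic byte, a version byte, the SummaryNode tree, and the dataset label, and readFields(DataInput) consumes the magic and version (without validating them), rebuilds the node tree, and recomputes the preorder numbering. The .avro and .summary paths and the example class name are placeholders.

    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;

    import com.cloudera.recordbreaker.schemadict.SchemaStatisticalSummary;

    public class SummaryPersistenceExample {
      public static void main(String[] args) throws IOException {
        SchemaStatisticalSummary summary = new SchemaStatisticalSummary("dataset-A");
        summary.createSummaryFromData(new File("datasetA.avro"));   // placeholder path

        // Persist the finished summary so later comparisons can skip the data scan.
        try (DataOutputStream out = new DataOutputStream(new FileOutputStream("datasetA.summary"))) {
          summary.write(out);
        }

        // Reload it; readFields() rebuilds the SummaryNode tree and its preorder ids.
        SchemaStatisticalSummary reloaded = new SchemaStatisticalSummary();
        try (DataInputStream in = new DataInputStream(new FileInputStream("datasetA.summary"))) {
          reloaded.readFields(in);
        }
        System.out.println(reloaded.getDatasetLabel());
      }
    }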
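Finally, a small sketch of the per-node inspection accessors getDesc(), getTypeDesc(), getLabel(), and getDocStr(), which take the preorder id assigned by computePreorder(). Ids start at 0 for the root and are contiguous in preorder, and the accessors return null once the id runs past the last node, which is what the loop below relies on. The data file path and example class name are placeholders.

    import java.io.File;
    import java.io.IOException;

    import com.cloudera.recordbreaker.schemadict.SchemaStatisticalSummary;

    public class SchemaNodeDumpExample {
      public static void main(String[] args) throws IOException {
        SchemaStatisticalSummary summary = new SchemaStatisticalSummary("dataset-A");
        summary.createSummaryFromData(new File("datasetA.avro"));   // placeholder path

        // Walk the summary tree in preorder until the accessors report no such node.
        for (int nodeid = 0; ; nodeid++) {
          String desc = summary.getDesc(nodeid);
          if (desc == null) {
            break;   // past the last node
          }
          String doc = summary.getDocStr(nodeid);
          System.out.println(nodeid + " [" + summary.getTypeDesc(nodeid) + "] " + desc
              + ((doc == null || doc.isEmpty()) ? "" : "   doc: " + doc));
        }
      }
    }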