Java tutorial
/* Copyright 2012 Explorys, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.explorys.apothecary.hadoop.mr; import java.io.Closeable; import java.io.EOFException; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; public class MapFileReader implements Closeable { public static class ImmutableState implements Closeable { /** Number of index entries to skip between each entry. Zero by default. * Setting this to values larger than zero can facilitate opening large map * files using less memory. */ private int INDEX_SKIP = 0; // the index, in memory private int count = -1; private WritableComparable[] keys; private long[] positions; private WritableComparator comparator; private SequenceFile.Reader index; // whether the index Reader was closed private boolean indexClosed = false; public ImmutableState(FileSystem fs, String dirName, Configuration conf) throws IOException { INDEX_SKIP = conf.getInt("io.map.index.skip", 0); Path dir = new Path(dirName); Path indexFile = new Path(dir, MapFile.INDEX_FILE_NAME); // open the index this.index = new SequenceFile.Reader(fs, indexFile, conf); { Path dataFile = new Path(dir, MapFile.DATA_FILE_NAME); SequenceFile.Reader data = new SequenceFile.Reader(fs, dataFile, conf); this.comparator = WritableComparator.get(data.getKeyClass().asSubclass(WritableComparable.class)); data.close(); } readIndex(); } public int getCount() { return count; } public WritableComparable[] getKeys() { return keys; } public long[] getPositions() { return positions; } private void readIndex() throws IOException { // read the index entirely into memory if (this.keys != null) return; this.count = 0; this.keys = new WritableComparable[1024]; this.positions = new long[1024]; try { int skip = INDEX_SKIP; LongWritable position = new LongWritable(); WritableComparable lastKey = null; while (true) { WritableComparable k = comparator.newKey(); if (!index.next(k, position)) break; // check order to make sure comparator is compatible if (lastKey != null && comparator.compare(lastKey, k) > 0) throw new IOException("key out of order: " + k + " after " + lastKey); lastKey = k; if (skip > 0) { skip--; continue; // skip this entry } else { skip = INDEX_SKIP; // reset skip } if (count == keys.length) { // time to grow arrays int newLength = (keys.length * 3) / 2; WritableComparable[] newKeys = new WritableComparable[newLength]; long[] newPositions = new long[newLength]; System.arraycopy(keys, 0, newKeys, 0, count); System.arraycopy(positions, 0, newPositions, 0, count); keys = newKeys; positions = newPositions; } keys[count] = k; positions[count] = position.get(); count++; } } catch (EOFException e) { LOG.warn("Unexpected EOF reading " + index + " at entry #" + count + ". Ignoring."); } finally { indexClosed = true; index.close(); } } @Override public void close() throws IOException { if (!indexClosed) { index.close(); } } } private static final Log LOG = LogFactory.getLog(MapFileReader.class); private WritableComparator comparator; private WritableComparable nextKey; private long seekPosition = -1; private int seekIndex = -1; private long firstPosition; // the data, on disk private SequenceFile.Reader data; public WritableComparator getComparator() { return comparator; } /** Returns the class of keys in this file. */ public Class<?> getKeyClass() { return data.getKeyClass(); } /** Returns the class of values in this file. */ public Class<?> getValueClass() { return data.getValueClass(); } /** Construct a map reader for the named map.*/ public MapFileReader(FileSystem fs, String dirName, Configuration conf) throws IOException { this(fs, dirName, null, conf); } /** Construct a map reader for the named map using the named comparator.*/ public MapFileReader(FileSystem fs, String dirName, WritableComparator comparator, Configuration conf) throws IOException { this(fs, dirName, comparator, conf, true); } /** * Hook to allow subclasses to defer opening streams until further * initialization is complete. * @see #createDataFileReader(FileSystem, Path, Configuration) */ protected MapFileReader(FileSystem fs, String dirName, WritableComparator comparator, Configuration conf, boolean open) throws IOException { if (open) { open(fs, dirName, comparator, conf); } } protected synchronized void open(FileSystem fs, String dirName, WritableComparator comparator, Configuration conf) throws IOException { Path dir = new Path(dirName); Path dataFile = new Path(dir, MapFile.DATA_FILE_NAME); // open the data this.data = createDataFileReader(fs, dataFile, conf); this.firstPosition = data.getPosition(); if (comparator == null) this.comparator = WritableComparator.get(data.getKeyClass().asSubclass(WritableComparable.class)); else this.comparator = comparator; } /** * Override this method to specialize the type of * {@link SequenceFile.Reader} returned. */ protected SequenceFile.Reader createDataFileReader(FileSystem fs, Path dataFile, Configuration conf) throws IOException { return new SequenceFile.Reader(fs, dataFile, conf); } /** Re-positions the reader before its first key. */ public synchronized void reset() throws IOException { data.seek(firstPosition); } /** Get the key at approximately the middle of the file. * * @throws IOException */ public synchronized WritableComparable midKey(ImmutableState state) throws IOException { int pos = ((state.getCount() - 1) / 2); // middle of the index if (pos < 0) { throw new IOException("MapFile empty"); } return state.getKeys()[pos]; } /** Reads the final key from the file. * * @param key key to read into */ public synchronized void finalKey(WritableComparable key, ImmutableState state) throws IOException { long originalPosition = data.getPosition(); // save position try { // make sure index is valid if (state.getCount() > 0) { data.seek(state.getPositions()[state.getCount() - 1]); // skip to last indexed entry } else { reset(); // start at the beginning } while (data.next(key)) { } // scan to eof } finally { data.seek(originalPosition); // restore position } } /** Positions the reader at the named key, or if none such exists, at the * first entry after the named key. Returns true iff the named key exists * in this map. */ public synchronized boolean seek(WritableComparable key, ImmutableState state) throws IOException { return seekInternal(key, state) == 0; } /** * Positions the reader at the named key, or if none such exists, at the * first entry after the named key. * * @return 0 - exact match found * < 0 - positioned at next record * 1 - no more records in file */ private synchronized int seekInternal(WritableComparable key, ImmutableState state) throws IOException { return seekInternal(key, false, state); } /** * Positions the reader at the named key, or if none such exists, at the * key that falls just before or just after dependent on how the * <code>before</code> parameter is set. * * @param before - IF true, and <code>key</code> does not exist, position * file at entry that falls just before <code>key</code>. Otherwise, * position file at record that sorts just after. * @return 0 - exact match found * < 0 - positioned at next record * 1 - no more records in file */ private synchronized int seekInternal(WritableComparable key, final boolean before, ImmutableState state) throws IOException { if (seekIndex != -1 // seeked before && seekIndex + 1 < state.getCount() && comparator.compare(key, state.getKeys()[seekIndex + 1]) < 0 // before next indexed && comparator.compare(key, nextKey) >= 0) { // but after last seeked // do nothing } else { seekIndex = binarySearch(key, state); if (seekIndex < 0) // decode insertion point seekIndex = -seekIndex - 2; if (seekIndex == -1) // belongs before first entry seekPosition = firstPosition; // use beginning of file else seekPosition = state.getPositions()[seekIndex]; // else use index } data.seek(seekPosition); if (nextKey == null) nextKey = comparator.newKey(); // If we're looking for the key before, we need to keep track // of the position we got the current key as well as the position // of the key before it. long prevPosition = -1; long curPosition = seekPosition; while (data.next(nextKey)) { int c = comparator.compare(key, nextKey); if (c <= 0) { // at or beyond desired if (before && c != 0) { if (prevPosition == -1) { // We're on the first record of this index block // and we've already passed the search key. Therefore // we must be at the beginning of the file, so seek // to the beginning of this block and return c data.seek(curPosition); } else { // We have a previous record to back up to data.seek(prevPosition); data.next(nextKey); // now that we've rewound, the search key must be greater than this key return 1; } } return c; } if (before) { prevPosition = curPosition; curPosition = data.getPosition(); } } return 1; } private int binarySearch(WritableComparable key, ImmutableState state) { int low = 0; int high = state.getCount() - 1; while (low <= high) { int mid = (low + high) >>> 1; WritableComparable midVal = state.getKeys()[mid]; int cmp = comparator.compare(midVal, key); if (cmp < 0) low = mid + 1; else if (cmp > 0) high = mid - 1; else return mid; // key found } return -(low + 1); // key not found. } /** Read the next key/value pair in the map into <code>key</code> and * <code>val</code>. Returns true if such a pair exists and false when at * the end of the map */ public synchronized boolean next(WritableComparable key, Writable val) throws IOException { return data.next(key, val); } /** Return the value for the named key, or null if none exists. */ public synchronized Writable get(WritableComparable key, Writable val, ImmutableState state) throws IOException { if (seek(key, state)) { data.getCurrentValue(val); return val; } else return null; } /** * Finds the record that is the closest match to the specified key. * Returns <code>key</code> or if it does not exist, at the first entry * after the named key. * - * @param key - key that we're trying to find - * @param val - data value if key is found - * @return - the key that was the closest match or null if eof. */ public synchronized WritableComparable getClosest(WritableComparable key, Writable val, ImmutableState state) throws IOException { return getClosest(key, val, false, state); } /** * Finds the record that is the closest match to the specified key. * * @param key - key that we're trying to find * @param val - data value if key is found * @param before - IF true, and <code>key</code> does not exist, return * the first entry that falls just before the <code>key</code>. Otherwise, * return the record that sorts just after. * @return - the key that was the closest match or null if eof. */ public synchronized WritableComparable getClosest(WritableComparable key, Writable val, final boolean before, ImmutableState state) throws IOException { int c = seekInternal(key, before, state); // If we didn't get an exact match, and we ended up in the wrong // direction relative to the query key, return null since we // must be at the beginning or end of the file. if ((!before && c > 0) || (before && c < 0)) { return null; } data.getCurrentValue(val); return nextKey; } /** Close the map. */ public synchronized void close() throws IOException { data.close(); } }