Java tutorial
/******************************************************************************* * Copyright (c) 2012 Nikos Papailiou. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Public License v3.0 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/gpl.html * * Contributors: * Nikos Papailiou - initial API and implementation ******************************************************************************/ package input_format; import java.io.IOException; import java.util.Iterator; import javaewah.EWAHCompressedBitmap; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.Text; import org.apache.hadoop.hbase.io.hfile.HFile.Reader; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFileScanner; public class HFileRecordReaderBitmapCrossRegion extends RecordReader<ImmutableBytesWritable, Text> { private ImmutableBytesWritable key = null; private Text value = null; private TableColumnSplit tsplit = null; private Reader reader = null; private HFileScanner scanner = null; private KeyValue kv = null; private boolean more = false; private int regions = 0, first, processed, totalRecords; private byte[] lastRowKey = null; private HBaseConfiguration HBconf = new HBaseConfiguration(); private byte[] stopr, startr; private HTable table; private Iterator<Integer> BitmapIter; /** * Closes the split. * * @see org.apache.hadoop.mapreduce.RecordReader#close() */ @Override public void close() { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } /** * Returns the current key. * * @return The current key. * @throws IOException * @throws InterruptedException When the job is aborted. * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey() */ @Override public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException { return key; } /** * Returns the current value. * * @return The current value. * @throws IOException When the value is faulty. * @throws InterruptedException When the job is aborted. * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue() */ @Override public Text getCurrentValue() throws IOException, InterruptedException { return value; } /** * Initializes the reader. * * @param inputsplit The split to work with. * @param context The current task context. * @throws IOException When setting up the reader fails. * @throws InterruptedException When the job is aborted. * @see org.apache.hadoop.mapreduce.RecordReader#initialize( * org.apache.hadoop.mapreduce.InputSplit, * org.apache.hadoop.mapreduce.TaskAttemptContext) */ @Override public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException, InterruptedException { tsplit = (TableColumnSplit) inputsplit; EWAHCompressedBitmap regionBitmap = tsplit.getRegionBitmap(); BitmapIter = regionBitmap.iterator(); totalRecords = regionBitmap.cardinality(); System.out.println("contains: " + regionBitmap.cardinality() + " size:" + regionBitmap.sizeInBytes()); table = new HTable(HBconf, tsplit.getTable()); //table.flushCommits(); processed = 0; nextBitmapRegion(); } private void nextBitmapRegion() { regions++; //System.out.println("next region: "+regions); if (!BitmapIter.hasNext()) { System.out.println("Bitmap end"); more = false; return; } int curBitmapInt = BitmapIter.next(); processed++; //System.out.println(curBitmapInt); byte[] curBitmapByteInt = Bytes.toBytes(curBitmapInt); //System.out.println("curInt: "+Bytes.toStringBinary(curBitmapByteInt)); startr = new byte[8 + 1]; stopr = new byte[8 + 1]; startr[0] = (byte) 1; stopr[0] = (byte) 1; for (int j = 1; j < curBitmapByteInt.length + 1; j++) { startr[j] = curBitmapByteInt[j - 1]; stopr[j] = curBitmapByteInt[j - 1]; } for (int j = 1 + curBitmapByteInt.length; j < startr.length; j++) { startr[j] = (byte) 0; stopr[j] = (byte) 255; } //System.out.println("Next Bitmap start: "+Bytes.toStringBinary(startr)); KeyValue rowKey = KeyValue.createFirstOnRow(startr); try { table.flushCommits(); if (lastRowKey == null) {//read the first HFile of the bitmap FileSystem fs = FileSystem.get(HBconf); String dir = ""; dir = "/hbase/" + tsplit.getTable() + "/" + table.getRegionLocation(startr).getRegionInfo().getEncodedName() + "/A"; Path regionDir = new Path(dir); Path file = null; Path[] hfiles = FileUtil.stat2Paths(fs.listStatus(regionDir)); for (Path hfile : hfiles) { file = new Path(dir + "/" + hfile.getName()); } //System.out.println(dir); System.out.println("First HFile: " + dir + "/" + file.getName()); reader = HFile.createReader(fs, file, new CacheConfig(HBconf)); //reader = new Reader(fs, file, null, false); // Load up the index. lastRowKey = table.getRegionLocation(startr).getRegionInfo().getEndKey(); reader.loadFileInfo(); // Get a scanner that caches and that does not use pread. scanner = reader.getScanner(false, true); scanner.seekBefore(rowKey.getKey()); first = 1; while (scanner.next()) { kv = scanner.getKeyValue(); //System.out.println("first key in Hfile: "+Bytes.toStringBinary(kv.getRow())); if (Bytes.compareTo(kv.getRow(), startr) >= 0 && Bytes.compareTo(kv.getRow(), stopr) <= 0) { //System.out.println("curkey: "+Bytes.toStringBinary(kv.getRow())); more = true; break; } if (Bytes.compareTo(kv.getRow(), stopr) > 0) { if (BitmapIter.hasNext()) { nextBitmapRegion(); } else { System.out.println("Bitmap end"); more = false; break; } } } } else { if (Bytes.compareTo(lastRowKey, startr) > 0) {//same region scanner.seekBefore(rowKey.getKey()); //System.out.println("Same region"); first = 1; while (scanner.next()) { kv = scanner.getKeyValue(); //System.out.println("next key: "+Bytes.toStringBinary(kv.getRow())); if (Bytes.compareTo(kv.getRow(), startr) >= 0 && Bytes.compareTo(kv.getRow(), stopr) <= 0) { //System.out.println("curkey: "+Bytes.toStringBinary(kv.getRow())); more = true; break; } if (Bytes.compareTo(kv.getRow(), stopr) > 0) { if (BitmapIter.hasNext()) { nextBitmapRegion(); } else { System.out.println("Bitmap end"); more = false; break; } } } } else {//open new HFile FileSystem fs = FileSystem.get(HBconf); String dir = ""; dir = "/hbase/" + tsplit.getTable() + "/" + table.getRegionLocation(startr).getRegionInfo().getEncodedName() + "/A"; Path regionDir = new Path(dir); Path file = null; Path[] hfiles = FileUtil.stat2Paths(fs.listStatus(regionDir)); for (Path hfile : hfiles) { file = new Path(dir + "/" + hfile.getName()); } System.out.println("New Hfile" + dir + "/" + file.getName()); reader = HFile.createReader(fs, file, new CacheConfig(HBconf)); //reader = new Reader(fs, file, null, false); // Load up the index. lastRowKey = table.getRegionLocation(startr).getRegionInfo().getEndKey(); reader.loadFileInfo(); // Get a scanner that caches and that does not use pread. scanner = reader.getScanner(false, true); scanner.seekBefore(rowKey.getKey()); first = 1; while (scanner.next()) { kv = scanner.getKeyValue(); //System.out.println("first key in Hfile: "+Bytes.toStringBinary(kv.getRow())); if (Bytes.compareTo(kv.getRow(), startr) >= 0 && Bytes.compareTo(kv.getRow(), stopr) <= 0) { //System.out.println("curkey: "+Bytes.toStringBinary(kv.getRow())); more = true; break; } if (Bytes.compareTo(kv.getRow(), stopr) > 0) { if (BitmapIter.hasNext()) { nextBitmapRegion(); } else { System.out.println("Bitmap end"); more = false; break; } } } } } } catch (IOException e) { e.printStackTrace(); } } /** * Positions the record reader to the next record. * * @return <code>true</code> if there was another record. * @throws IOException When reading the record failed. * @throws InterruptedException When the job was aborted. * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue() */ @Override public boolean nextKeyValue() throws IOException, InterruptedException { if (key == null) key = new ImmutableBytesWritable(); if (value == null) value = new Text(); if (first == 1) { if (more) { byte[] indexKey = kv.getRow(); byte[] indexKey1 = new byte[8]; for (int i = 0; i < indexKey1.length; i++) { indexKey1[i] = indexKey[i + 1]; } value.set(tsplit.getFname() + "!" + Bytes.toLong(indexKey1) + "$$" + Bytes.toString(kv.getValue())); //System.out.println("used key: "+Bytes.toStringBinary(kv.getRow())); } first = 2; return more; } if (!more) return more; if (!scanner.isSeeked()) {//bug System.out.println("Not seeked"); return nextHfile(); } if (scanner.next()) { kv = scanner.getKeyValue(); //System.out.println("next key: "+Bytes.toStringBinary(kv.getRow())); if (Bytes.compareTo(kv.getRow(), startr) >= 0 && Bytes.compareTo(kv.getRow(), stopr) <= 0) { byte[] indexKey = kv.getRow(); byte[] indexKey1 = new byte[8]; for (int i = 0; i < indexKey1.length; i++) { indexKey1[i] = indexKey[i + 1]; } value.set(tsplit.getFname() + "!" + Bytes.toLong(indexKey1) + "$$" + Bytes.toString(kv.getValue())); //System.out.println("used key: "+Bytes.toStringBinary(kv.getRow())); return true; } if (Bytes.compareTo(kv.getRow(), stopr) > 0) { if (BitmapIter.hasNext()) { nextBitmapRegion(); if (!scanner.isSeeked()) {//bug System.out.println("Not seeked"); return nextHfile(); } if (more) { byte[] indexKey = kv.getRow(); byte[] indexKey1 = new byte[8]; for (int i = 0; i < indexKey1.length; i++) { indexKey1[i] = indexKey[i + 1]; } value.set(tsplit.getFname() + "!" + Bytes.toLong(indexKey1) + "$$" + Bytes.toString(kv.getValue())); //System.out.println("used key: "+Bytes.toStringBinary(kv.getRow())); } first = 2; return more; } else { System.out.println("Bitmap end"); return false; } } else { System.out.println("Bug"); return false; } } //next region return nextHfile(); } private boolean nextHfile() { FileSystem fs; try { fs = FileSystem.get(HBconf); String dir = ""; dir = "/hbase/" + tsplit.getTable() + "/" + table.getRegionLocation(lastRowKey).getRegionInfo().getEncodedName() + "/A"; Path regionDir = new Path(dir); Path file = null; Path[] hfiles = FileUtil.stat2Paths(fs.listStatus(regionDir)); for (Path hfile : hfiles) { file = new Path(dir + "/" + hfile.getName()); } //System.out.println("Last row key"+Bytes.toStringBinary(lastRowKey)); System.out.println("HFile: " + dir + "/" + file.getName()); reader = HFile.createReader(fs, file, new CacheConfig(HBconf)); //reader = new Reader(fs, file, null, false); // Load up the index. lastRowKey = table.getRegionLocation(lastRowKey).getRegionInfo().getEndKey(); reader.loadFileInfo(); // Get a scanner that caches and that does not use pread. scanner = reader.getScanner(false, true); scanner.seekTo(); first = 1; if (scanner.next()) { kv = scanner.getKeyValue(); //System.out.println("next key after change: "+Bytes.toStringBinary(kv.getRow())); if (Bytes.compareTo(kv.getRow(), startr) >= 0 && Bytes.compareTo(kv.getRow(), stopr) <= 0) { byte[] indexKey = kv.getRow(); byte[] indexKey1 = new byte[8]; for (int i = 0; i < indexKey1.length; i++) { indexKey1[i] = indexKey[i + 1]; } value.set(tsplit.getFname() + "!" + Bytes.toLong(indexKey1) + "$$" + Bytes.toString(kv.getValue())); //System.out.println("used key: "+Bytes.toStringBinary(kv.getRow())); return true; } if (Bytes.compareTo(kv.getRow(), stopr) > 0) { if (BitmapIter.hasNext()) { nextBitmapRegion(); if (!scanner.isSeeked()) {//bug System.out.println("Not seeked"); return nextHfile(); } if (more) { byte[] indexKey = kv.getRow(); byte[] indexKey1 = new byte[8]; for (int i = 0; i < indexKey1.length; i++) { indexKey1[i] = indexKey[i + 1]; } value.set(tsplit.getFname() + "!" + Bytes.toLong(indexKey1) + "$$" + Bytes.toString(kv.getValue())); //System.out.println("used key after change: "+Bytes.toStringBinary(kv.getRow())); } first = 2; return more; } else { System.out.println("Bitmap end"); return false; } } else { System.out.println("Bug"); return false; } } } catch (IOException e) { e.printStackTrace(); } System.out.println("Bug"); return false; } /** * The current progress of the record reader through its data. * * @return A number between 0.0 and 1.0, the fraction of the data read. * @see org.apache.hadoop.mapreduce.RecordReader#getProgress() */ @Override public float getProgress() { // Depends on the total number of tuples float progress = (float) processed / totalRecords; if (progress < 1) return progress; else return new Float(1.0); } }