input_format.HFileRecordReaderBufferedNoScan.java Source code

Introduction

Here is the source code for input_format.HFileRecordReaderBufferedNoScan.java, a Hadoop RecordReader that emits ImmutableBytesWritable/Text pairs read directly from the HFiles backing an HBase region, bypassing the normal HBase scan API.

Source

/*******************************************************************************
 * Copyright (c) 2012 Nikos Papailiou. 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Nikos Papailiou - initial API and implementation
 ******************************************************************************/
package input_format;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.hbase.io.hfile.HFile.Reader;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;

import com.ibm.icu.util.StringTokenizer;

import bytes.ByteValues;
import bytes.NotSupportedDatatypeException;

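/**
 * A RecordReader that serves a TableColumnSplit by reading directly from the
 * HFiles that back the corresponding HBase region on HDFS, instead of going
 * through the HBase scan API. Keys are HBase row keys; values are strings
 * whose layout depends on the number of variables carried by the split.
 */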
public class HFileRecordReaderBufferedNoScan extends RecordReader<ImmutableBytesWritable, Text> {

    private ImmutableBytesWritable key = null;
    private Text value = null;
    private TableColumnSplit tsplit = null;
    private Reader reader = null;
    private HFileScanner scanner = null;
    private KeyValue kv = null;
    private boolean more = false;
    private final Configuration HBconf = HBaseConfiguration.create();
    private byte[] stopr;

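    // NOTE: the fields below are static and therefore shared by every
    // instance of this reader running in the same JVM.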
    private static long id;
    private static int size = 0;
    private static int varsno;
    private static float progress = 0;
    private static String v1, v2;

    /**
     * Closes the underlying HFile reader.
     * 
     * @see org.apache.hadoop.mapreduce.RecordReader#close()
     */
    @Override
    public void close() {
        try {
            // Guard against close() being called before initialize() succeeds.
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the current key.
     *  
     * @return The current key.
     * @throws IOException When reading the key fails.
     * @throws InterruptedException When the job is aborted.
     * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey()
     */
    @Override
    public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * Returns the current value.
     * 
     * @return The current value.
     * @throws IOException When reading the value fails.
     * @throws InterruptedException When the job is aborted.
     * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue()
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Initializes the reader.
     * 
     * @param inputsplit  The split to work with.
     * @param context  The current task context.
     * @throws IOException When setting up the reader fails.
     * @throws InterruptedException When the job is aborted.
     * @see org.apache.hadoop.mapreduce.RecordReader#initialize(
     *   org.apache.hadoop.mapreduce.InputSplit, 
     *   org.apache.hadoop.mapreduce.TaskAttemptContext)
     */
    @Override
    public void initialize(InputSplit inputsplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        tsplit = (TableColumnSplit) inputsplit;

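        // The HTable is opened only to locate the region that covers this
        // split; the actual reads go straight to the region's HFiles on HDFS.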
        HTable table = new HTable(HBconf, tsplit.getTable());

        byte[] startr = tsplit.getStartRow();

        stopr = tsplit.getStopRow();
        System.out.println("start: " + Bytes.toStringBinary(startr));
        System.out.println("stop: " + Bytes.toStringBinary(stopr));

        FileSystem fs = FileSystem.get(HBconf);
        // Locate the directory of column family "A" for the region that holds
        // this split. A start row of length 1 is treated specially: the region
        // is looked up by the stop row instead.
        String dir;
        if (startr.length == 1) {
            dir = "/hbase/" + tsplit.getTable() + "/"
                    + table.getRegionLocation(stopr).getRegionInfo().getEncodedName() + "/A";
        } else {
            dir = "/hbase/" + tsplit.getTable() + "/"
                    + table.getRegionLocation(startr).getRegionInfo().getEncodedName() + "/A";
        }

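        // The path layout /hbase/<table>/<encoded-region-name>/<family>
        // matches the on-disk layout of the HBase versions this code targets.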
        Path regionDir = new Path(dir);
        Path file = null;
        // Pick an HFile from the family directory; if several exist, the last
        // one listed wins.
        Path[] hfiles = FileUtil.stat2Paths(fs.listStatus(regionDir));
        for (Path hfile : hfiles) {
            file = new Path(dir + "/" + hfile.getName());
        }
        if (file == null) {
            throw new IOException("No HFile found in " + regionDir);
        }
        System.out.println(dir + "/" + file.getName());

        reader = HFile.createReader(fs, file, new CacheConfig(HBconf));

        // Load up the index.
        reader.loadFileInfo();
        // Get a scanner that caches and that does not use pread.
        scanner = reader.getScanner(false, true);

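        // Seek to just before the first key of the split, then advance the
        // scanner until it reaches a row >= the split's start row.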
        KeyValue rowKey = KeyValue.createFirstOnRow(startr);
        scanner.seekBefore(rowKey.getKey());
        id = table.getRegionLocation(startr).getRegionInfo().getRegionId();
        if (!scanner.isSeeked()) {
            scanner.seekTo();
        }
        while (scanner.next()) {
            kv = scanner.getKeyValue();
            if (Bytes.compareTo(kv.getRow(), startr) >= 0) {
                more = true;
                break;
            }
        }

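        // Split file names starting with "T" use the sentinel varsno = 15;
        // otherwise varsno counts the variables named in the split, which
        // determines the output format produced by nextKeyValue().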
        if (!tsplit.getFname().startsWith("T")) {
            String vars = tsplit.getVars();
            System.out.println(vars);
            StringTokenizer vtok = new StringTokenizer(vars);
            varsno = 0;
            while (vtok.hasMoreTokens()) {
                vtok.nextToken();
                varsno++;
            }
            if (varsno == 1) {
                StringTokenizer vtok2 = new StringTokenizer(vars);
                v1 = vtok2.nextToken();
            } else if (varsno == 2) {
                StringTokenizer vtok2 = new StringTokenizer(vars);
                v1 = vtok2.nextToken();
                v2 = vtok2.nextToken();
            }
            progress = 0.2f;
        } else {
            varsno = 15;
        }
    }

    /**
     * Positions the record reader to the next record.
     *  
     * @return <code>true</code> if there was another record.
     * @throws IOException When reading the record fails.
     * @throws InterruptedException When the job is aborted.
     * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!more)
            return more;
        if (key == null)
            key = new ImmutableBytesWritable();
        if (value == null)
            value = new Text();
        if (!scanner.isSeeked()) { // known issue: the scanner may lose its seek position; treat as end of input
            return false;
        }

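        // varsno selects the output format: 15 for "T" splits, otherwise the
        // number of variables (0, 1 or 2) carried by the split.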
        try {
            if (varsno == 15) {
                if (Bytes.compareTo(kv.getRow(), stopr) <= 0) {
                    byte[] indexKey = kv.getRow();
                    // Copy the row key minus its leading byte.
                    byte[] indexKey1 = new byte[ByteValues.totalBytes];
                    System.arraycopy(indexKey, 1, indexKey1, 0, indexKey1.length);
                    value.set(tsplit.getFname() + "!" + ByteValues.getStringValue(indexKey1) + "$$"
                            + Bytes.toString(kv.getValue()));
                    more = scanner.next();
                    if (more)
                        kv = scanner.getKeyValue();
                    return more;
                }
                return false;

            }

            if (varsno == 0) {
                if (Bytes.compareTo(kv.getRow(), stopr) <= 0) {
                    key.set(kv.getRow());
                    value.set(tsplit.getFname() + "!" + ByteValues.getStringValue(kv.getValue()));
                    more = scanner.next();
                    if (more)
                        kv = scanner.getKeyValue();
                    return more;
                }
                return false;

            }
            if (varsno == 1) {
                if (Bytes.compareTo(kv.getRow(), stopr) <= 0) {
                    value.set(tsplit.getFname() + "!" + v1 + "#" + ByteValues.getStringValue(kv.getQualifier())
                            + "_");
                    more = scanner.next();
                    if (more)
                        kv = scanner.getKeyValue();
                    return more;
                }
                return false;
            } else if (varsno == 2) {
                if (Bytes.compareTo(kv.getRow(), stopr) <= 0) {
                    byte[] curkey = kv.getRow();
                    // Extract the second bound value from the row key.
                    byte[] r1 = new byte[ByteValues.totalBytes];
                    System.arraycopy(curkey, ByteValues.totalBytes + 1, r1, 0, r1.length);
                    String pat = tsplit.getFname() + "!" + v1 + "#" + ByteValues.getStringValue(r1);
                    // Aggregate all qualifiers of the current row into a single
                    // value, flushing early once the buffer reaches 10000 chars.
                    StringBuilder buffer = new StringBuilder(
                            "!" + v2 + "#" + ByteValues.getStringValue(kv.getQualifier()) + "_");
                    boolean flush = false;
                    while (scanner.next() && rowEquals((kv = scanner.getKeyValue()).getRow(), curkey)) {
                        buffer.append(ByteValues.getStringValue(kv.getQualifier())).append("_");
                        if (buffer.length() >= 10000) {
                            value.set(pat + buffer);
                            flush = true;
                            break;
                        }
                    }
                    if (!flush) {
                        value.set(pat + buffer);
                        // NOTE: if the scanner is exhausted at this point, kv
                        // still holds the current row and the buffered value is
                        // dropped by returning false.
                        if (!Bytes.equals(kv.getRow(), curkey)) {
                            return true;
                        } else {
                            return false;
                        }
                    } else {
                        more = scanner.next();
                        if (more)
                            kv = scanner.getKeyValue();
                        return more;
                    }
                }
                return false;
            }
        } catch (NotSupportedDatatypeException e) {
            throw new InterruptedException("Unsupported datatype");
        }

        return false;
    }

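    /**
     * Compares only the leading 1 + 2 * ByteValues.totalBytes bytes of the two
     * rows, i.e. the fixed-length row-key prefix.
     */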
    private boolean rowEquals(byte[] row, byte[] curkey) {
        boolean ret = true;
        for (int i = 0; i < 1 + 2 * ByteValues.totalBytes; i++) {
            if (row[i] != curkey[i]) {
                ret = false;
                break;
            }
        }
        return ret;
    }

    /**
     * The current progress of the record reader through its data.
     * 
     * @return A number between 0.0 and 1.0, the fraction of the data read.
     * @see org.apache.hadoop.mapreduce.RecordReader#getProgress()
     */
    @Override
    public float getProgress() {
        // Only a rough static estimate; the reader does not know in advance
        // how many tuples it will emit.
        return progress;
    }

}
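
Usage

For context, this reader is driven by the MapReduce framework through an InputFormat that produces TableColumnSplit instances. The sketch below is a minimal, hypothetical example of that wiring: the input format name HFileInputFormatBufferedNoScan is an assumption (the companion class is not shown on this page), while the mapper and job setup use only standard Hadoop APIs. It simply echoes the ImmutableBytesWritable/Text pairs the reader emits.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ScanHFilesJob {

    // Receives exactly the pairs produced by HFileRecordReaderBufferedNoScan:
    // the HBase row key and the serialized bindings string.
    public static class EchoMapper
            extends Mapper<ImmutableBytesWritable, Text, ImmutableBytesWritable, Text> {
        @Override
        protected void map(ImmutableBytesWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "read HBase HFiles without the scan API");
        job.setJarByClass(ScanHFilesJob.class);
        // Hypothetical companion InputFormat that creates TableColumnSplit
        // instances and returns HFileRecordReaderBufferedNoScan readers.
        job.setInputFormatClass(HFileInputFormatBufferedNoScan.class);
        job.setMapperClass(EchoMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}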