infinidb.hadoop.db.InfiniDBInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for infinidb.hadoop.db.InfiniDBInputFormat.java

Source

/*
 * Copyright (c) 2014 InfiniDB, Inc.
 *
 * InfiniDB, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package infinidb.hadoop.db;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.mapred.lib.db.*;

/**
 * A InputFormat that reads input data from an SQL table.
 * <p>
 * DBInputFormat emits LongWritables containing the record number as 
 * key and DBWritables as value. 
 * 
 * The SQL query, and input class can be using one of the two 
 * setInput methods.
 */
public class InfiniDBInputFormat<T extends DBWritable> implements InputFormat<LongWritable, T>, JobConfigurable {
    /**
     * A RecordReader that reads records from a SQL table.
     * Emits LongWritables containing the record number as 
     * key and DBWritables as value.
     */
    protected class DBRecordReader implements RecordReader<LongWritable, T> {
        private ResultSet results;

        private Statement statement;

        private Class<T> inputClass;

        private JobConf job;

        private InfiniDBInputSplit split;

        private long pos = 0;

        /**
         * @param split The InputSplit to read data for
         * @throws SQLException 
         */
        protected DBRecordReader(InfiniDBInputSplit split, Class<T> inputClass, JobConf job) throws SQLException {
            this.inputClass = inputClass;
            this.split = split;
            this.job = job;

            statement = connection.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);

            //statement.setFetchSize(Integer.MIN_VALUE);
            results = statement.executeQuery(getSelectQuery());
        }

        /** @InfiniDB */
        public <S> String concat(S[] arr, String sep) {
            String ret = "";
            for (int i = 0; i < arr.length; i++) {
                ret = ret + arr[i];
                if (i < arr.length - 1) {
                    ret = ret + sep;
                }
            }
            return ret;
        }

        /** @InfiniDB Returns the query for selecting the records, 
         * subclasses can override this for custom behaviour.*/
        protected String getSelectQuery() {
            InfiniDBConfiguration conf = new InfiniDBConfiguration(job);
            StringBuilder query = new StringBuilder();
            query.append("SELECT ");
            query.append(concat(conf.getInputFieldNames(), ","));
            query.append(" FROM ");
            query.append(conf.getInputTableName());
            query.append(" WHERE ");
            query.append(split.splitKey + ">=" + split.getStart());
            query.append(" AND ");
            query.append(split.splitKey + "<" + split.getEnd());
            if (conditions != null && conditions.length() > 0)
                query.append(" AND (").append(conditions).append(")");
            return query.toString();
        }

        /** {@inheritDoc} */
        public void close() throws IOException {
            try {
                connection.commit();
                results.close();
                statement.close();
            } catch (SQLException e) {
                throw new IOException(e.getMessage());
            }
        }

        /** {@inheritDoc} */
        public LongWritable createKey() {
            return new LongWritable();
        }

        /** {@inheritDoc} */
        public T createValue() {
            return ReflectionUtils.newInstance(inputClass, job);
        }

        /** {@inheritDoc} */
        public long getPos() throws IOException {
            return pos;
        }

        /** {@inheritDoc} */
        public float getProgress() throws IOException {
            return pos / (float) split.getLength();
        }

        /** {@inheritDoc} */
        public boolean next(LongWritable key, T value) throws IOException {
            try {
                if (!results.next())
                    return false;

                // Set the key field value as the output key value
                key.set(pos + split.getStart());

                value.readFields(results);

                pos++;
            } catch (SQLException e) {
                throw new IOException(e.getMessage());
            }
            return true;
        }
    }

    /**
     * A Class that does nothing, implementing DBWritable
     */
    public static class NullDBWritable implements DBWritable, Writable {
        @Override
        public void readFields(DataInput in) throws IOException {
        }

        @Override
        public void readFields(ResultSet arg0) throws SQLException {
        }

        @Override
        public void write(DataOutput out) throws IOException {
        }

        @Override
        public void write(PreparedStatement arg0) throws SQLException {
        }
    }

    /**
     * A InputSplit that spans a set of rows
     */
    protected static class InfiniDBInputSplit implements InputSplit {

        private long end = 0;
        private long start = 0;
        private String splitKey;

        /**
         * Default Constructor
         */
        public InfiniDBInputSplit() {
        }

        /**
         * @InfiniDB
         * Convenience Constructor
         * @param start the index of the first row to select
         * @param end the index of the last row to select
         */
        public InfiniDBInputSplit(long start, long end, String key) {
            this.start = start;
            this.end = end;
            this.splitKey = key;
        }

        /** {@inheritDoc} */
        public String[] getLocations() throws IOException {
            return new String[] {};
        }

        /**
         * @return The index of the first row to select
         */
        public long getStart() {
            return start;
        }

        /**
         * @return The index of the last row to select
         */
        public long getEnd() {
            return end;
        }

        /**
         * @return The total row count in this split
         */
        public long getLength() throws IOException {
            return end - start;
        }

        /** {@inheritDoc} */
        public void readFields(DataInput input) throws IOException {
            start = input.readLong();
            end = input.readLong();
            splitKey = WritableUtils.readString(input);
        }

        /** {@inheritDoc} */
        public void write(DataOutput output) throws IOException {
            output.writeLong(start);
            output.writeLong(end);
            WritableUtils.writeString(output, splitKey);
        }
    }

    private String conditions;

    private Connection connection;

    private String tableName;

    private String[] fieldNames;

    private InfiniDBConfiguration dbConf;

    /** {@inheritDoc} */
    public void configure(JobConf job) {

        dbConf = new InfiniDBConfiguration(job);

        try {
            this.connection = dbConf.getConnection();
            this.connection.setAutoCommit(false);
            connection.setTransactionIsolation(Connection.TRANSACTION_SERIALIZABLE);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        tableName = dbConf.getInputTableName();
        fieldNames = dbConf.getInputFieldNames();
        conditions = dbConf.getInputConditions();
    }

    /** {@inheritDoc} */
    @SuppressWarnings("unchecked")
    public RecordReader<LongWritable, T> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {

        Class inputClass = dbConf.getInputClass();
        try {
            return new DBRecordReader((InfiniDBInputSplit) split, inputClass, job);
        } catch (SQLException ex) {
            throw new IOException(ex.getMessage());
        }
    }

    /** @InfiniDB */
    private long getMaxVal(InfiniDBConfiguration conf, Connection conn, String tableName, String col) {
        if (conf.getMaxVal() != null) {
            return conf.getMaxVal();
        }
        try {
            PreparedStatement s = conn.prepareStatement("SELECT MAX(" + col + ") FROM " + tableName);
            ResultSet rs = s.executeQuery();
            rs.next();
            long ret = rs.getLong(1);
            rs.close();
            s.close();
            return ret;
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** @InfiniDB */
    private long getMinVal(InfiniDBConfiguration conf, Connection conn, String tableName, String col) {
        if (conf.getMinVal() != null) {
            return conf.getMinVal();
        }
        try {
            PreparedStatement s = conn.prepareStatement("SELECT MIN(" + col + ") FROM " + tableName);
            ResultSet rs = s.executeQuery();
            rs.next();
            long ret = rs.getLong(1);
            rs.close();
            s.close();
            return ret;
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** {@inheritDoc} 
     * @InfiniDB 
     */
    public InputSplit[] getSplits(JobConf job, int chunks) throws IOException {

        try {
            InfiniDBConfiguration conf = new InfiniDBConfiguration(job);
            Connection conn = conf.getConnection();
            String splitKey = conf.getSplitKey();
            long maxVal = getMaxVal(conf, conn, conf.getInputTableName(), conf.getSplitKey());
            long minVal = getMinVal(conf, conn, conf.getInputTableName(), conf.getSplitKey());
            System.out.println("max=" + maxVal);
            System.out.println("min=" + minVal);

            InputSplit[] ret = new InputSplit[chunks];
            long chunkSize = (maxVal - minVal + 1) / chunks + 1;
            long start = minVal;
            for (int i = 0; i < chunks; i++) {
                ret[i] = new InfiniDBInputSplit(start, start + chunkSize, splitKey);
                start += chunkSize;
            }

            conn.close();
            return ret;
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

    }

    /** Returns the query for getting the total number of rows, 
     * subclasses can override this for custom behaviour.*/
    protected String getCountQuery() {

        if (dbConf.getInputCountQuery() != null) {
            return dbConf.getInputCountQuery();
        }

        StringBuilder query = new StringBuilder();
        query.append("SELECT COUNT(*) FROM " + tableName);

        if (conditions != null && conditions.length() > 0)
            query.append(" WHERE " + conditions);
        return query.toString();
    }

    /**
     * @InfiniDB
     * Initializes the map-part of the job with the appropriate input settings.
     * 
     * @param job The job
     * @param inputClass the class object implementing DBWritable, which is the 
     * Java object holding tuple fields.
     * @param tableName The table to read data from
     * @param conditions The condition which to select data with, eg. '(updated >
     * 20070101 AND length > 0)'
     * @param key the field name used for split key.
     * @param fieldNames The field names in the table
     * @see #setInput(JobConf, Class, String, String)
     */
    public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String tableName,
            String conditions, String key, String... fieldNames) {

        job.setInputFormat(InfiniDBInputFormat.class);
        InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
        dbConf.setInputClass(inputClass);
        dbConf.setInputTableName(tableName);
        dbConf.setInputFieldNames(fieldNames);
        dbConf.setInputConditions(conditions);
        dbConf.setSplitKey(key);
    }

    /**
     * @InfiniDB
     * Initializes the map-part of the job with the appropriate input settings.
     * 
     * @param job The job
     * @param inputClass the class object implementing DBWritable, which is the 
     * Java object holding tuple fields.
     * @param inputQuery the input query to select fields. Example : 
     * "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
     * @param inputCountQuery the input query that returns the number of records in
     * the table. 
     * Example : "SELECT COUNT(f1) FROM Mytable"
     * @see #setInput(JobConf, Class, String, String, String, String...)
     */
    public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String inputQuery,
            String inputCountQuery) {
        job.setInputFormat(InfiniDBInputFormat.class);

        InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
        dbConf.setInputClass(inputClass);
        dbConf.setInputQuery(inputQuery);
        dbConf.setInputCountQuery(inputCountQuery);

    }

}