org.schedoscope.export.jdbc.outputformat.JdbcOutputFormat.java Source code

Introduction

Here is the source code for org.schedoscope.export.jdbc.outputformat.JdbcOutputFormat.java

Source

/**
 * Copyright 2016 Otto (GmbH & Co KG)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.schedoscope.export.jdbc.outputformat;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

import org.apache.commons.dbutils.DbUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.schedoscope.export.jdbc.exception.RetryException;
import org.schedoscope.export.jdbc.exception.UnrecoverableException;
import org.schedoscope.export.jdbc.outputschema.Schema;
import org.schedoscope.export.jdbc.outputschema.SchemaFactory;
import org.schedoscope.export.utils.JdbcQueryUtils;

/**
 * The JDBC output format is responsible for writing data into a database
 * using a JDBC connection.
 *
 * @param <K>
 *            The key class.
 * @param <V>
 *            The value class.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class JdbcOutputFormat<K, V extends DBWritable> extends OutputFormat<K, V> {

    private static final Log LOG = LogFactory.getLog(JdbcOutputFormat.class);

    private static final String TMPDB = "TMP_";

    @Override
    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
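        // No output specification checks are performed; temporary output
        // tables are created per task in getRecordWriter().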
    }

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {

        // Commit handling is delegated to the standard FileOutputCommitter,
        // using the file output path configured for the job.
        return new FileOutputCommitter(FileOutputFormat.getOutputPath(context), context);
    }

    /**
     * The JDBC Record Writer is used to write data into a database using a JDBC
     * connection.
     */
    @InterfaceStability.Evolving
    public class JdbcRecordWriter extends RecordWriter<K, V> {

        private Connection connection;
        private PreparedStatement statement;
        private int rowsInBatch = 0;
        private int rowsTotal = 0;
        private int commitSize = 25000;

        public JdbcRecordWriter() throws SQLException {
        }

        /**
         * The constructor to initialize the JDBC Record Writer.
         *
         * @param connection
         *            The JDBC connection.
         * @param statement
         *            The prepared statement.
         * @param commitSize
         *            The batch size.
         * @throws SQLException
         *             Is thrown if an error occurs.
         */
        public JdbcRecordWriter(Connection connection, PreparedStatement statement, int commitSize)
                throws SQLException {

            this.connection = connection;
            this.statement = statement;
            this.commitSize = commitSize;
            this.connection.setAutoCommit(false);
        }

        public Connection getConnection() {

            return connection;
        }

        public PreparedStatement getStatement() {

            return statement;
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {

            try {

                if (rowsInBatch > 0) {
                    statement.executeBatch();
                }

                if (rowsTotal > 0) {
                    connection.commit();
                }

            } catch (SQLException e) {
                try {
                    connection.rollback();
                } catch (SQLException ex) {
                    LOG.warn(StringUtils.stringifyException(ex));
                }
                throw new IOException(e.getMessage());
            } finally {
                DbUtils.closeQuietly(statement);
                DbUtils.closeQuietly(connection);
            }
        }

        @Override
        public void write(K key, V value) throws IOException {

            try {
                value.write(statement);
                statement.addBatch();
                rowsInBatch++;
                rowsTotal++;
                // Flush the batch once the configured commit size is reached.
                if (rowsInBatch >= commitSize) {
                    statement.executeBatch();
                    rowsInBatch = 0;
                }
            } catch (SQLException e) {
                // Propagate the error instead of swallowing it silently.
                throw new IOException(e.getMessage(), e);
            }
        }
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException {

        Schema outputSchema = SchemaFactory.getSchema(context.getConfiguration());

        String tmpOutputTable = getTablePrefix(outputSchema) + outputSchema.getTable() + "_"
                + context.getTaskAttemptID().getTaskID().getId();
        String createTableQuery = outputSchema.getCreateTableQuery();

        createTableQuery = createTableQuery.replace(outputSchema.getTable(), tmpOutputTable);

        int commitSize = outputSchema.getCommitSize();
        String[] fieldNames = outputSchema.getColumnNames();

        try {
            Connection connection = outputSchema.getConnection();

            JdbcQueryUtils.dropTable(tmpOutputTable, connection);
            JdbcQueryUtils.createTable(createTableQuery, connection);

            PreparedStatement statement = connection
                    .prepareStatement(JdbcQueryUtils.createInsertQuery(tmpOutputTable, fieldNames));

            return new JdbcRecordWriter(connection, statement, commitSize);

        } catch (Exception ex) {
            throw new IOException(ex.getMessage());
        }
    }

    private static String getTablePrefix(Schema outputSchema) {
        if (outputSchema.getFilter() != null) {
            String filter = outputSchema.getFilter().replace("=", "");
            return TMPDB + filter + "_";
        } else {
            return TMPDB;
        }
    }
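
    // A hypothetical example of the naming scheme above: with an input filter
    // of "month=201601" and an output table "my_table", the reducer with task
    // id 3 writes into the temporary table TMP_month201601_my_table_3, which
    // finalizeOutput() later merges back into "my_table".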

    /**
     * Initializes the JDBCOutputFormat.
     *
     * @param conf
     *            The Hadoop configuration object.
     * @param connectionString
     *            The JDBC connection string.
     * @param username
     *            The database user name.
     * @param password
     *            The database password.
     * @param outputTable
     *            The output table.
     * @param inputFilter
     *            The input filter.
     * @param outputNumberOfPartitions
     *            The number of partitions / reducers.
     * @param outputCommitSize
     *            The batch size.
     * @param storageEngine
     *            The storage engine, either MyISAM or InnoDB (MySQL).
     * @param distributedBy
     *            An optional distribute by clause (Exasol).
     * @param columnNames
     *            The column names.
     * @param columnsTypes
     *            The column types.
     * @throws IOException
     *             Is thrown if an error occurs.
     */
    public static void setOutput(Configuration conf, String connectionString, String username, String password,
            String outputTable, String inputFilter, int outputNumberOfPartitions, int outputCommitSize,
            String storageEngine, String distributedBy, String[] columnNames, String[] columnsTypes)
            throws IOException {

        Schema outputSchema = SchemaFactory.getSchema(connectionString, conf);
        outputSchema.setOutput(connectionString, username, password, outputTable, inputFilter,
                outputNumberOfPartitions, outputCommitSize, storageEngine, distributedBy, columnNames,
                columnsTypes);
    }

    /**
     * This function finalizes the JDBC export: it merges all partitions,
     * drops the temporary tables, and optionally updates the output table.
     *
     * @param conf
     *            The Hadoop configuration object.
     * @throws RetryException
     *             Is thrown if a SQL error occurs.
     * @throws UnrecoverableException
     *             Is thrown if a JDBC driver issue occurs.
     */
    public static void finalizeOutput(Configuration conf) throws RetryException, UnrecoverableException {

        Schema outputSchema = SchemaFactory.getSchema(conf);
        String outputTable = outputSchema.getTable();
        String tmpOutputTable = getTablePrefix(outputSchema) + outputSchema.getTable();
        String createTableStatement = outputSchema.getCreateTableQuery();
        String inputFilter = outputSchema.getFilter();
        int outputNumberOfPartitions = outputSchema.getNumberOfPartitions();

        Connection connection = null;

        try {
            connection = outputSchema.getConnection();

            if (inputFilter != null) {
                JdbcQueryUtils.deleteExisitingRows(outputTable, inputFilter, connection);
            } else {
                JdbcQueryUtils.dropTable(outputTable, connection);
            }

            JdbcQueryUtils.createTable(createTableStatement, connection);
            JdbcQueryUtils.mergeOutput(outputTable, getTablePrefix(outputSchema), outputNumberOfPartitions,
                    connection);
            JdbcQueryUtils.dropTemporaryOutputTables(tmpOutputTable, outputNumberOfPartitions, connection);

        } catch (SQLException ex1) {
            LOG.error(ex1.getMessage());
            throw new RetryException(ex1.getMessage());
        } catch (ClassNotFoundException ex2) {
            LOG.error(ex2.getMessage());
            throw new UnrecoverableException(ex2.getMessage());
        } finally {
            DbUtils.closeQuietly(connection);
        }
    }

    /**
     * This function is called if the MR job doesn't finish successfully.
     *
     * @param conf
     *            The Hadoop configuration object.
     * @throws RetryException
     *             Is thrown if a SQL error occurs.
     * @throws UnrecoverableException
     *             Is thrown if a JDBC driver issue occurs.
     */
    public static void rollback(Configuration conf) throws RetryException, UnrecoverableException {

        Schema outputSchema = SchemaFactory.getSchema(conf);
        String tmpOutputTable = getTablePrefix(outputSchema) + outputSchema.getTable();
        int outputNumberOfPartitions = outputSchema.getNumberOfPartitions();

        Connection connection = null;

        try {
            connection = outputSchema.getConnection();
            JdbcQueryUtils.dropTemporaryOutputTables(tmpOutputTable, outputNumberOfPartitions, connection);

        } catch (SQLException ex1) {
            LOG.error(ex1.getMessage());
            throw new RetryException(ex1.getMessage());
        } catch (ClassNotFoundException ex2) {
            LOG.error(ex2.getMessage());
            throw new UnrecoverableException(ex2.getMessage());
        } finally {
            DbUtils.closeQuietly(connection);
        }
    }
}
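
A rough usage sketch, not part of the listing above: the driver class name,
connection string, table and column definitions below are made-up
placeholders; only setOutput(), finalizeOutput(), rollback() and the output
format class itself come from the source shown here. A MapReduce driver could
wire the export together roughly as follows.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.schedoscope.export.jdbc.outputformat.JdbcOutputFormat;

public class JdbcExportDriver {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        // Store connection details and the output schema in the configuration.
        JdbcOutputFormat.setOutput(conf,
                "jdbc:mysql://localhost:3306/exports",    // connection string (placeholder)
                "user", "secret",                         // credentials (placeholders)
                "my_table",                               // output table
                "month=201601",                           // input filter, may be null
                4,                                        // number of partitions / reducers
                25000,                                    // commit (batch) size
                "InnoDB",                                 // storage engine (MySQL)
                null,                                     // distribute by clause (Exasol)
                new String[] { "id", "name" },            // column names (placeholders)
                new String[] { "int", "varchar(255)" });  // column types (placeholders)

        Job job = Job.getInstance(conf, "jdbc-export");
        job.setOutputFormatClass(JdbcOutputFormat.class);
        // getOutputCommitter() reuses FileOutputCommitter, so the job still
        // gets a file output path.
        FileOutputFormat.setOutputPath(job, new Path("/tmp/jdbc-export"));
        // ... set input format, mapper, reducer and output key/value classes
        // here; reducer values must implement DBWritable ...

        boolean success = job.waitForCompletion(true);

        if (success) {
            // Merge the temporary per-reducer tables into the output table.
            JdbcOutputFormat.finalizeOutput(job.getConfiguration());
        } else {
            // Drop the temporary tables again.
            JdbcOutputFormat.rollback(job.getConfiguration());
        }
    }
}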