com.redgate.hadoop.hive.azuretables.AzureTablesInputFormat.java Source code

Introduction

Here is the source code for com.redgate.hadoop.hive.azuretables.AzureTablesInputFormat.java, a Hive InputFormat that lets Azure Table Storage back Hive external tables.

Source

/*******************************************************************************
 * Copyright 2013 Simon Elliston Ball
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.redgate.hadoop.hive.azuretables;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * A HiveInputFormat implementation that allows Azure Tables to back Hive
 * external tables.
 * 
 * @author Simon Elliston Ball <simon@simonellistonball.com>
 * 
 */
public class AzureTablesInputFormat extends HiveInputFormat<Text, MapWritable> {
    public static final Log LOG = LogFactory.getLog(AzureTablesInputFormat.class);

    /**
     * Sets up a RecordReader for an Azure Table, and builds a connection
     * string from the table properties.
     */
    @Override
    public RecordReader<Text, MapWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
            throws IOException {
        String table = conf.get(ConfigurationUtil.TABLE);
        String accountName = ConfigurationUtil.accountName(conf);
        String storageConnectionString = ConfigurationUtil.getStorageConnectionString(conf);
        LOG.info(String.format("Connecting to table %s on account %s", table, accountName));
        return new AzureTablesRecordReader(storageConnectionString, table, split);
    }

    /**
     * Determine InputSplits for the Azure Table reader, using the partition
     * keys given in the table definition as the input split boundaries.
     * 
     * @param conf
     *            The Hadoop job configuration
     * @param numSplits
     *            The desired number of splits, which is pretty much ignored
     *            at the moment. This is not ideal, since there should be a
     *            secondary mechanism to further split a table partition.
     */
    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        LOG.info("Partition keys: " + conf.get(ConfigurationUtil.PARTITION_KEYS));
        String[] partitionKeys = ConfigurationUtil.partitionKeys(conf);
        return AzureTablesSplit.getSplits(conf, partitionKeys, numSplits);
    }
}
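
The class above depends on a ConfigurationUtil helper from the same package that is not reproduced on this page. As a minimal sketch of what such a helper might look like, the hypothetical version below reads the account credentials, table name, and partition keys from the job configuration and assembles a standard Azure storage connection string. The property names and the key-based authentication scheme are assumptions for illustration, not the project's actual definitions.

package com.redgate.hadoop.hive.azuretables;

import org.apache.hadoop.mapred.JobConf;

/**
 * Hypothetical sketch of the ConfigurationUtil helper used above; the
 * property names and connection-string details are assumptions.
 */
public final class ConfigurationUtil {
    // Assumed table-property keys (the real class may use different names)
    public static final String TABLE = "azure.table.name";
    public static final String ACCOUNT_NAME = "azure.table.account.name";
    public static final String ACCOUNT_KEY = "azure.table.account.key";
    public static final String PARTITION_KEYS = "azure.table.partition.keys";

    private ConfigurationUtil() {
    }

    /** Reads the storage account name from the job configuration. */
    public static String accountName(JobConf conf) {
        return conf.get(ACCOUNT_NAME);
    }

    /**
     * Builds a standard Azure storage connection string of the form
     * DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...
     */
    public static String getStorageConnectionString(JobConf conf) {
        return String.format(
                "DefaultEndpointsProtocol=https;AccountName=%s;AccountKey=%s",
                accountName(conf), conf.get(ACCOUNT_KEY));
    }

    /** Splits the comma-separated partition key list from the table definition. */
    public static String[] partitionKeys(JobConf conf) {
        return conf.get(PARTITION_KEYS).split(",");
    }
}

In the real project these properties would presumably be supplied through the table properties of the Hive external table definition, which Hive copies into the JobConf before getSplits and getRecordReader run.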