Java tutorial
/** * (c) Copyright 2012 WibiData, Inc. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kiji.mapreduce; import static com.google.common.base.Preconditions.checkNotNull; import java.io.IOException; import java.util.Iterator; import java.util.List; import com.google.common.collect.Lists; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.SerializationUtils; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.mapreduce.TableSplit; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.kiji.schema.EntityId; import org.kiji.schema.Kiji; import org.kiji.schema.KijiDataRequest; import org.kiji.schema.KijiRowData; import org.kiji.schema.KijiRowScanner; import org.kiji.schema.KijiTable; import org.kiji.schema.KijiTableReader; import org.kiji.schema.KijiURI; import org.kiji.schema.KijiURIException; import org.kiji.schema.impl.DefaultHBaseFactory; import org.kiji.schema.impl.HBaseEntityId; import org.kiji.schema.impl.HBaseKijiRowData; import org.kiji.schema.impl.HBaseKijiTable; /** InputFormat for Hadoop MapReduce jobs reading from a Kiji table. */ public class KijiTableInputFormat extends InputFormat<EntityId, KijiRowData> implements Configurable { private static final Logger LOG = LoggerFactory.getLogger(KijiTableInputFormat.class); /** Configuration of this input format. */ private Configuration mConf; /** {@inheritDoc} */ @Override public void setConf(Configuration conf) { mConf = conf; } /** {@inheritDoc} */ @Override public Configuration getConf() { return mConf; } /** {@inheritDoc} */ @Override public RecordReader<EntityId, KijiRowData> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { return new KijiTableRecordReader(mConf); } /** {@inheritDoc} */ @Override public List<InputSplit> getSplits(JobContext context) throws IOException { final Configuration conf = context.getConfiguration(); final KijiURI inputTableURI = getInputTableURI(conf); final Kiji kiji = Kiji.open(inputTableURI, conf); final KijiTable table = kiji.openTable(inputTableURI.getTable()); final HBaseAdmin admin = DefaultHBaseFactory.Provider.get().getHBaseAdminFactory(inputTableURI) .create(conf); final HTableInterface htable = HBaseKijiTable.downcast(table).getHTable(); final List<HRegionInfo> regions = admin.getTableRegions(htable.getTableName()); final List<InputSplit> splits = Lists.newArrayList(); for (HRegionInfo region : regions) { final byte[] startKey = region.getStartKey(); final TableSplit tableSplit = new TableSplit(htable.getTableName(), startKey, region.getEndKey(), table.getName()); splits.add(new KijiTableSplit(tableSplit, startKey)); } return splits; } /** * Configures a Hadoop M/R job to read from a given table. * * @param job Job to configure. * @param tableURI URI of the table to read from. * @param dataRequest Data request. * @param startRow Minimum row key to process. * @param endRow Maximum row Key to process. * @throws IOException on I/O error. */ public static void configureJob(Job job, KijiURI tableURI, KijiDataRequest dataRequest, String startRow, String endRow) throws IOException { final Configuration conf = job.getConfiguration(); final Kiji kiji = Kiji.open(tableURI, conf); final KijiTable table = kiji.openTable(tableURI.getTable()); IOUtils.closeQuietly(table); IOUtils.closeQuietly(kiji); // TODO: Check for jars config: // GenericTableMapReduceUtil.initTableInput(hbaseTableName, scan, job); // TODO: Obey specified start/end rows. // Write all the required values to the job's configuration object. job.setInputFormatClass(KijiTableInputFormat.class); final String serializedRequest = Base64.encodeBase64String(SerializationUtils.serialize(dataRequest)); conf.set(KijiConfKeys.INPUT_DATA_REQUEST, serializedRequest); conf.set(KijiConfKeys.INPUT_TABLE_URI, tableURI.toString()); } /** Hadoop record reader for Kiji table rows. */ public static class KijiTableRecordReader extends RecordReader<EntityId, KijiRowData> { /** Data request. */ protected final KijiDataRequest mDataRequest; /** Hadoop Configuration object containing settings. */ protected final Configuration mConf; private Kiji mKiji = null; private KijiTable mTable = null; private KijiTableReader mReader = null; private KijiRowScanner mScanner = null; private Iterator<KijiRowData> mIterator = null; private KijiTableSplit mSplit = null; private HBaseKijiRowData mCurrentRow = null; /** * Creates a new RecordReader for this input format. This RecordReader will perform the actual * reads from Kiji. * * @param conf The configuration object for this Kiji. */ public KijiTableRecordReader(Configuration conf) { mConf = conf; // Get data request from the job configuration. final String dataRequestB64 = checkNotNull(mConf.get(KijiConfKeys.INPUT_DATA_REQUEST), "Missing data request in job configuration."); final byte[] dataRequestBytes = Base64.decodeBase64(Bytes.toBytes(dataRequestB64)); mDataRequest = (KijiDataRequest) SerializationUtils.deserialize(dataRequestBytes); } /** {@inheritDoc} */ @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { assert split instanceof KijiTableSplit; mSplit = (KijiTableSplit) split; final Configuration conf = context.getConfiguration(); final KijiURI inputURI = getInputTableURI(conf); mKiji = Kiji.open(inputURI, conf); mTable = mKiji.openTable(inputURI.getTable()); mReader = mTable.openTableReader(); mScanner = mReader.getScanner(mDataRequest, new HBaseEntityId(mSplit.getStartRow()), new HBaseEntityId(mSplit.getEndRow())); mIterator = mScanner.iterator(); mCurrentRow = null; } /** {@inheritDoc} */ @Override public EntityId getCurrentKey() throws IOException { return mCurrentRow.getEntityId(); } /** {@inheritDoc} */ @Override public KijiRowData getCurrentValue() throws IOException { return mCurrentRow; } /** {@inheritDoc} */ @Override public float getProgress() throws IOException { // TODO: Implement return 0.0f; } /** {@inheritDoc} */ @Override public boolean nextKeyValue() throws IOException { if (mIterator.hasNext()) { mCurrentRow = (HBaseKijiRowData) mIterator.next(); return true; } else { mCurrentRow = null; return false; } } /** {@inheritDoc} */ @Override public void close() throws IOException { IOUtils.closeQuietly(mScanner); IOUtils.closeQuietly(mReader); IOUtils.closeQuietly(mTable); IOUtils.closeQuietly(mKiji); mIterator = null; mScanner = null; mReader = null; mTable = null; mKiji = null; mSplit = null; mCurrentRow = null; } } /** * Reports the URI of the configured input table. * * @param conf Read the input URI from this configuration. * @return the configured input URI. * @throws IOException on I/O error. */ private static KijiURI getInputTableURI(Configuration conf) throws IOException { try { return KijiURI.parse(conf.get(KijiConfKeys.INPUT_TABLE_URI)); } catch (KijiURIException kue) { throw new IOException(kue); } } }