Source code

Java tutorial


Here is the source code for com.tuplejump.calliope.hadoop.cql3.CqlRecordReader.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.tuplejump.calliope.hadoop.cql3;

import com.datastax.driver.core.*;
import com.datastax.driver.core.exceptions.NoHostAvailableException;
import com.tuplejump.calliope.hadoop.ColumnFamilySplit;
import com.tuplejump.calliope.hadoop.ConfigHelper;
import com.tuplejump.calliope.hadoop.MultiRangeSplit;
import com.tuplejump.calliope.hadoop.TokenRangeHolder;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.utils.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.*;

/**
 * CqlRecordReader reads the rows returned from the CQL query.
 * It uses CQL auto-paging.
 * <p/>
 * The key is a Long: a local CQL row key that starts from 0.
 * <p/>
 * The value is the row as a C* java driver CQL result set row.
 * The input CQL query must satisfy two requirements:
 * 1) the select clause must include the partition key columns (to calculate the progress based on the actual CF rows processed)
 * 2) the where clause must include token(partition_key1, ... , partition_keyn) > ? and
 * token(partition_key1, ... , partition_keyn) <= ?  (in that order)
 */
public class CqlRecordReader extends RecordReader<Long, Row>
        implements org.apache.hadoop.mapred.RecordReader<Long, Row> {
    private static final Logger logger = LoggerFactory.getLogger(CqlRecordReader.class);
    // The split handed to the initializer; the row iterators cast it to
    // ColumnFamilySplit or MultiRangeSplit to read their token ranges.
    private InputSplit split;
    // Source of rows: a SingleRangeRowIterator or a MultiRangeRowIterator,
    // created at the end of initialization.
    private RowIterator rowIterator;

    // Pair assigned by nextKeyValue(): left is returned as the current key,
    // right as the current value.
    private Pair<Long, Row> currentRow;
    // Estimated number of rows in the split: the split length, or the configured
    // input split size when the length is Long.MAX_VALUE. Denominator of getProgress().
    private int totalRowCount; // total number of rows to fetch
    // Quoted keyspace name from the job configuration; passed to cluster.connect().
    private String keyspace;
    // Quoted column family (table) name from the job configuration.
    private String cfName;
    // CQL statement from the job configuration; executed with a start and an end token.
    private String cqlQuery;
    // Driver cluster obtained from CqlConfigHelper for one of the split's locations.
    private Cluster cluster;
    // Session connected to the keyspace; stays null when no cluster could be created.
    private Session session;
    // Partitioner from the job configuration; its token validator parses the split tokens.
    private IPartitioner partitioner;

    // partition keys -- key aliases
    // Filled by the row iterators from table metadata; only the key set is read
    // (every value is Boolean.TRUE).
    private LinkedHashMap<String, Boolean> partitionBoundColumns = Maps.newLinkedHashMap();

    /**
     * Creates an uninitialized reader. The cluster, session and row iterator
     * are only set up later by {@code initialize}.
     */
    public CqlRecordReader() {
        super();
        // NOTE(review): the call target of this statement was missing from the
        // source; an info-level log call is assumed -- confirm.
        logger.info("Creating CQL Record Reader");
    }

    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
        if (CqlConfigHelper.getMultiRangeInputSplit(context.getConfiguration())) {
  "Initializing Record reader with MultiRangeSplit");
            initializeWithMultiRangeSplit(split, context);
        } else {
  "Initializing Record reader with SingleRangeSplit");
            initializeWithColumnFamilySplit(split, context);

    private void initializeWithColumnFamilySplit(InputSplit split, TaskAttemptContext context) throws IOException {
        this.split = split;
        ColumnFamilySplit cfSplit = (ColumnFamilySplit) split;
        Configuration conf = context.getConfiguration();
        totalRowCount = (cfSplit.getLength() < Long.MAX_VALUE) ? (int) cfSplit.getLength()
                : ConfigHelper.getInputSplitSize(conf);
        cfName = quote(ConfigHelper.getInputColumnFamily(conf));
        keyspace = quote(ConfigHelper.getInputKeyspace(conf));
        cqlQuery = CqlConfigHelper.getInputCql(conf);
        partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());

        try {
            if (cluster != null)
            // create connection using thrift
            String[] locations = split.getLocations();

            Exception lastException = null;
            for (String location : locations) {
                try {
                    cluster = CqlConfigHelper.getInputCluster(location, conf);
                } catch (Exception e) {
                    lastException = e;
                    logger.warn("Failed to create authenticated client to {}", location);
            if (cluster == null && lastException != null)
                throw lastException;
        } catch (Exception e) {
            throw new RuntimeException(e);

        if (cluster != null) {
            try {
                session = cluster.connect(keyspace);
            } catch (NoHostAvailableException nha) {
                Map<InetSocketAddress, Throwable> errors = nha.getErrors();
                for (InetSocketAddress isa : errors.keySet()) {
                    logger.error("ERROR ON HOST [" + isa.getAddress() + "/" + isa.getPort() + "] ");
                    logger.error("Connection Timeout:  "
                            + cluster.getConfiguration().getSocketOptions().getConnectTimeoutMillis());
                    logger.error("Local connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                    logger.error("Remote connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                    //logger.error("Connection Timeout:  " + cluster.getConfiguration().getSocketOptions().);
                throw nha;
        rowIterator = new SingleRangeRowIterator();
        logger.debug("created {}", rowIterator);

    private void initializeWithMultiRangeSplit(InputSplit split, TaskAttemptContext context) throws IOException {
        this.split = split;
        MultiRangeSplit cfSplit = (MultiRangeSplit) split;
        Configuration conf = context.getConfiguration();
        totalRowCount = (cfSplit.getLength() < Long.MAX_VALUE) ? (int) cfSplit.getLength()
                : ConfigHelper.getInputSplitSize(conf);
        cfName = quote(ConfigHelper.getInputColumnFamily(conf));
        keyspace = quote(ConfigHelper.getInputKeyspace(conf));
        cqlQuery = CqlConfigHelper.getInputCql(conf);
        partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());

        try {
            if (cluster != null)
            // create connection using thrift
            String[] locations = split.getLocations();

            Exception lastException = null;
            for (String location : locations) {
                try {
                    cluster = CqlConfigHelper.getInputCluster(location, conf);
                } catch (Exception e) {
                    lastException = e;
                    logger.warn("Failed to create authenticated client to {}", location);
            if (cluster == null && lastException != null)
                throw lastException;
        } catch (Exception e) {
            throw new RuntimeException(e);

        if (cluster != null) {
            try {
                session = cluster.connect(keyspace);
            } catch (NoHostAvailableException nha) {
                Map<InetSocketAddress, Throwable> errors = nha.getErrors();
                for (InetSocketAddress isa : errors.keySet()) {
                    logger.error("ERROR ON HOST [" + isa.getAddress() + "/" + isa.getPort() + "] ");
                    logger.error("Connection Timeout:  "
                            + cluster.getConfiguration().getSocketOptions().getConnectTimeoutMillis());
                    logger.error("Local connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                    logger.error("Remote connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                    //logger.error("Connection Timeout:  " + cluster.getConfiguration().getSocketOptions().);
                throw nha;
        rowIterator = new MultiRangeRowIterator();
        logger.debug("created {}", rowIterator);

    public void close() {
        if (session != null)
        if (cluster != null)

    public Long getCurrentKey() {
        return currentRow.left;

    public Row getCurrentValue() {
        return currentRow.right;

    public float getProgress() {
        if (!rowIterator.hasNext())
            return 1.0F;

        // the progress is likely to be reported slightly off the actual but close enough
        float progress = ((float) rowIterator.totalRead / totalRowCount);
        return progress > 1.0F ? 1.0F : progress;

    public boolean nextKeyValue() throws IOException {
        if (!rowIterator.hasNext()) {
            logger.debug("Finished scanning {} rows (estimate was: {})", rowIterator.totalRead, totalRowCount);
            return false;

        try {
            currentRow =;
        } catch (Exception e) {
            // throw it as IOException, so client can catch it and handle it at client side
            IOException ioe = new IOException(e.getMessage());
            throw ioe;
        return true;

    // Because the old Hadoop API wants us to write to the key and value
    // and the new asks for them, we need to copy the output of the new API
    // to the old. Thus, expect a small performance hit.
    // And obviously this wouldn't work for wide rows. But since ColumnFamilyInputFormat
    // and ColumnFamilyRecordReader don't support them, it should be fine for now.
    public boolean next(Long key, Row value) throws IOException {
        if (nextKeyValue()) {
            ((WrappedRow) value).setRow(getCurrentValue());
            return true;
        return false;

    public long getPos() throws IOException {
        return (long) rowIterator.totalRead;

    public Long createKey() {
        return new Long(0L);

    public Row createValue() {
        return new WrappedRow();

    private abstract class RowIterator extends AbstractIterator<Pair<Long, Row>> {
        protected int totalRead = 0; // total number of cf rows read

    /**
     * CQL row iterator over a single token range.
     * The input CQL query must satisfy two requirements:
     * 1) the select clause must include the key columns (if we use partition key based row count)
     * 2) the where clause must include token(partition_key1 ... partition_keyn) > ? and
     * token(partition_key1 ... partition_keyn) <= ?
     */
    private class SingleRangeRowIterator extends RowIterator {
        private long keyId = 0L;
        protected Iterator<Row> rows;
        private Map<String, ByteBuffer> previousRowKey = new HashMap<String, ByteBuffer>(); // previous CF row key

        public SingleRangeRowIterator() {
            ColumnFamilySplit cfSplit = (ColumnFamilySplit) split;
            if (session == null)
                throw new RuntimeException("Can't create connection session");

            AbstractType type = partitioner.getTokenValidator();

            if (logger.isDebugEnabled()) {
                logger.debug("QUERY: " + cqlQuery);
                logger.debug("START: " + cfSplit.getStartToken());
                logger.debug("END: " + cfSplit.getEndToken());

            ResultSet rs = session.execute(cqlQuery, type.compose(type.fromString(cfSplit.getStartToken())),
            for (ColumnMetadata meta : cluster.getMetadata().getKeyspace(keyspace).getTable(cfName)
                partitionBoundColumns.put(meta.getName(), Boolean.TRUE);
            rows = rs.iterator();

        protected Pair<Long, Row> computeNext() {
            if (rows == null || !rows.hasNext())
                return endOfData();

            Row row =;
            Map<String, ByteBuffer> keyColumns = new HashMap<String, ByteBuffer>();
            for (String column : partitionBoundColumns.keySet())
                keyColumns.put(column, row.getBytesUnsafe(column));

            // increase total CF row read
            if (previousRowKey.isEmpty() && !keyColumns.isEmpty()) {
                previousRowKey = keyColumns;
            } else {
                for (String column : partitionBoundColumns.keySet()) {
                    if (BytesType.bytesCompare(keyColumns.get(column), previousRowKey.get(column)) != 0) {
                        previousRowKey = keyColumns;
            return Pair.create(keyId, row);

    /**
     * CQL row iterator over several token ranges, queried one after another.
     * The input CQL query must satisfy two requirements:
     * 1) the select clause must include the key columns (if we use partition key based row count)
     * 2) the where clause must include token(partition_key1 ... partition_keyn) > ? and
     * token(partition_key1 ... partition_keyn) <= ?
     */
    private class MultiRangeRowIterator extends RowIterator {
        private long keyId = 0L;
        protected Iterator<Row> currentRangeRows;
        private TokenRangeHolder[] tokenRanges;
        private int currentRange;
        private int currentRow = 0;
        private Long startTime = System.nanoTime();
        private Map<String, ByteBuffer> previousRowKey = new HashMap<String, ByteBuffer>(); // previous CF row key
        AbstractType validatorType;

        public MultiRangeRowIterator() {
            MultiRangeSplit cfSplit = (MultiRangeSplit) split;
            if (session == null)
                throw new RuntimeException("Can't create connection session");

            validatorType = partitioner.getTokenValidator();

            if (logger.isDebugEnabled()) {
                logger.debug("QUERY: " + cqlQuery);
                logger.debug("Multi Range length is " + cfSplit.getLength());

  "Created new MultiRangeRowIterator");
            tokenRanges = cfSplit.getTokenRanges();
            currentRange = 0;

        private Iterator<Row> getNextRange() {
            if (logger.isDebugEnabled())
                logger.debug(String.format("Processing new token range. %d more to go!",
                        tokenRanges.length - currentRange));

            TokenRangeHolder range = tokenRanges[currentRange];

            Object startToken = validatorType.compose(validatorType.fromString(range.getStartToken()));
            Object endToken = validatorType.compose(validatorType.fromString(range.getEndToken()));

            logger.debug("Fetching rows with Query: " + cqlQuery + " in range start: [" + startToken
                    + "] + and end: [" + endToken + "]");

            ResultSet rs = session.execute(cqlQuery, startToken, endToken);
            for (ColumnMetadata meta : cluster.getMetadata().getKeyspace(keyspace).getTable(cfName)
                partitionBoundColumns.put(meta.getName(), Boolean.TRUE);


            return rs.iterator();

        private Row getNextRow() {
            if ((currentRangeRows == null || !currentRangeRows.hasNext()) && tokenRanges.length > currentRange) {
                do {
                    currentRangeRows = getNextRange();
                } while (!currentRangeRows.hasNext() && tokenRanges.length > currentRange);

            if (currentRangeRows == null) {
                return null;
            } else {

        protected Pair<Long, Row> computeNext() {
            Row row = getNextRow();

            if (row == null) {
      "Processed {} rows in {} token ranges from {} assigned ranges", currentRow,
                        currentRange, tokenRanges.length);
      "in {} nano seconds", System.nanoTime() - startTime);
      "Done processing all ranges!");
                return endOfData();

            if (logger.isDebugEnabled()) {
                logger.debug(String.format("Got new row. Row # %d of total # %d", totalRead, totalRowCount));

            Map<String, ByteBuffer> keyColumns = new HashMap<String, ByteBuffer>();
            for (String column : partitionBoundColumns.keySet())
                keyColumns.put(column, row.getBytesUnsafe(column));

            // increase total CF row read
            if (previousRowKey.isEmpty() && !keyColumns.isEmpty()) {
                previousRowKey = keyColumns;
            } else {
                for (String column : partitionBoundColumns.keySet()) {
                    if (BytesType.bytesCompare(keyColumns.get(column), previousRowKey.get(column)) != 0) {
                        previousRowKey = keyColumns;
            return Pair.create(keyId, row);

    private static class WrappedRow implements Row {
        private Row row;

        public void setRow(Row row) {
            this.row = row;

        public ColumnDefinitions getColumnDefinitions() {
            return row.getColumnDefinitions();

        public boolean isNull(int i) {
            return row.isNull(i);

        public boolean isNull(String name) {
            return row.isNull(name);

        public boolean getBool(int i) {
            return row.getBool(i);

        public boolean getBool(String name) {
            return row.getBool(name);

        public int getInt(int i) {
            return row.getInt(i);

        public int getInt(String name) {
            return row.getInt(name);

        public long getLong(int i) {
            return row.getLong(i);

        public long getLong(String name) {
            return row.getLong(name);

        public Date getDate(int i) {
            return row.getDate(i);

        public Date getDate(String name) {
            return row.getDate(name);

        public float getFloat(int i) {
            return row.getFloat(i);

        public float getFloat(String name) {
            return row.getFloat(name);

        public double getDouble(int i) {
            return row.getDouble(i);

        public double getDouble(String name) {
            return row.getDouble(name);

        public ByteBuffer getBytesUnsafe(int i) {
            return row.getBytesUnsafe(i);

        public ByteBuffer getBytesUnsafe(String name) {
            return row.getBytesUnsafe(name);

        public ByteBuffer getBytes(int i) {
            return row.getBytes(i);

        public ByteBuffer getBytes(String name) {
            return row.getBytes(name);

        public String getString(int i) {
            return row.getString(i);

        public String getString(String name) {
            return row.getString(name);

        public BigInteger getVarint(int i) {
            return row.getVarint(i);

        public BigInteger getVarint(String name) {
            return row.getVarint(name);

        public BigDecimal getDecimal(int i) {
            return row.getDecimal(i);

        public BigDecimal getDecimal(String name) {
            return row.getDecimal(name);

        public UUID getUUID(int i) {
            return row.getUUID(i);

        public UUID getUUID(String name) {
            return row.getUUID(name);

        public InetAddress getInet(int i) {
            return row.getInet(i);

        public InetAddress getInet(String name) {
            return row.getInet(name);

        public <T> List<T> getList(int i, Class<T> elementsClass) {
            return row.getList(i, elementsClass);

        public <T> List<T> getList(String name, Class<T> elementsClass) {
            return row.getList(name, elementsClass);

        public <T> Set<T> getSet(int i, Class<T> elementsClass) {
            return row.getSet(i, elementsClass);

        public <T> Set<T> getSet(String name, Class<T> elementsClass) {
            return row.getSet(name, elementsClass);

        public <K, V> Map<K, V> getMap(int i, Class<K> keysClass, Class<V> valuesClass) {
            return row.getMap(i, keysClass, valuesClass);

        public <K, V> Map<K, V> getMap(String name, Class<K> keysClass, Class<V> valuesClass) {
            return row.getMap(name, keysClass, valuesClass);

    private String quote(String identifier) {
        return "\"" + identifier.replaceAll("\"", "\"\"") + "\"";