com.spotify.hdfs2cass.cassandra.cql.CrunchCqlBulkRecordWriter.java Source code

Introduction

Here is the source code for com.spotify.hdfs2cass.cassandra.cql.CrunchCqlBulkRecordWriter.java, a Hadoop RecordWriter that buffers CQL rows into local SSTables and streams them into a Cassandra cluster when the task closes.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * The modifications to the upstream file are Copyright 2014 Spotify AB.
 * The original upstream file can be found at
 * https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
 */
package com.spotify.hdfs2cass.cassandra.cql;

import com.google.common.collect.Lists;
import com.google.common.util.concurrent.Uninterruptibles;
import com.spotify.hdfs2cass.cassandra.thrift.ProgressHeartbeat;
import com.spotify.hdfs2cass.cassandra.thrift.ProgressIndicator;
import com.spotify.hdfs2cass.crunch.CrunchConfigHelper;
import com.spotify.hdfs2cass.crunch.cql.CQLRecord;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.hadoop.AbstractBulkRecordWriter;
import org.apache.cassandra.hadoop.BulkRecordWriter;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.HadoopCompat;
import org.apache.cassandra.io.sstable.CQLSSTableWriter;
import org.apache.cassandra.io.sstable.SSTableLoader;
import org.apache.cassandra.streaming.StreamState;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

/**
 * This is an almost-copy of {@link org.apache.cassandra.hadoop.cql3.CqlBulkRecordWriter}
 * <p>
 * We had to re-implement this class because of https://issues.apache.org/jira/browse/CASSANDRA-8367
 * </p>
 */
public class CrunchCqlBulkRecordWriter extends AbstractBulkRecordWriter<ByteBuffer, CQLRecord> {

    private static final Logger LOG = LoggerFactory.getLogger(CrunchCqlBulkRecordWriter.class);

    private String keyspace;
    private final ProgressHeartbeat heartbeat;

    private String columnFamily;
    private String schema;
    private String insertStatement;
    private File outputDir;

    public CrunchCqlBulkRecordWriter(TaskAttemptContext context) {
        super(context);
        setConfigs();
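        // The heartbeat periodically reports progress so Hadoop does not time
        // the task out while SSTables are being written or streamed.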
        heartbeat = new ProgressHeartbeat(context, 120);
    }

    private void setConfigs() {
        // if anything is missing, exceptions will be thrown here, instead of on write()
        keyspace = ConfigHelper.getOutputKeyspace(conf);
        columnFamily = CrunchConfigHelper.getOutputColumnFamily(conf);
        schema = CrunchCqlBulkOutputFormat.getColumnFamilySchema(conf, columnFamily);
        insertStatement = CrunchCqlBulkOutputFormat.getColumnFamilyInsertStatement(conf, columnFamily);
        outputDir = getColumnFamilyDirectory();
    }

    private void prepareWriter() {
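        // Lazily create the SSTable writer and loader on first use; write()
        // calls this before every row, but only the first call does any work.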
        try {
            if (writer == null) {
                writer = CQLSSTableWriter.builder().forTable(schema).using(insertStatement)
                        .withPartitioner(ConfigHelper.getOutputPartitioner(conf)).inDirectory(outputDir).sorted()
                        .build();
            }
            if (loader == null) {
                CrunchExternalClient externalClient = new CrunchExternalClient(conf);
                externalClient.addKnownCfs(keyspace, schema);
                this.loader = new SSTableLoader(outputDir, externalClient,
                        new BulkRecordWriter.NullOutputHandler());
            }
        } catch (Exception e) {
            throw new CrunchRuntimeException(e);
        }
    }

    @Override
    public void write(final ByteBuffer ignoredKey, final CQLRecord record) {
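        // The partition key argument is ignored; everything needed to build
        // the row comes from the CQLRecord's values.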
        prepareWriter();
        // Clone each value: Crunch reuses the objects it passes in, so the
        // buffers must be copied before handing them to CQLSSTableWriter.
        List<ByteBuffer> bb = Lists.newArrayList();
        for (ByteBuffer v : record.getValues()) {
            bb.add(ByteBufferUtil.clone(v));
        }
        try {
            ((CQLSSTableWriter) writer).rawAddRow(bb);
            if (null != progress)
                progress.progress();
            if (null != context)
                HadoopCompat.progress(context);
        } catch (InvalidRequestException | IOException e) {
            LOG.error("Error adding row", e);
            throw new CrunchRuntimeException("Error adding row: " + e.getMessage(), e);
        }
    }

    private File getColumnFamilyDirectory() {
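        // SSTables are staged locally under <outputLocation>/<keyspace>/<columnFamily>
        // before being streamed to the cluster.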
        try {
            File dir = new File(String.format("%s%s%s%s%s", getOutputLocation(), File.separator, keyspace,
                    File.separator, columnFamily));
            if (!dir.exists() && !dir.mkdirs()) {
                throw new CrunchRuntimeException("Failed to created output directory: " + dir);
            }
            return dir;
        } catch (IOException e) {
            throw new CrunchRuntimeException(e);
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws InterruptedException {
        close();
    }

    @Override
    @Deprecated
    public void close(org.apache.hadoop.mapred.Reporter reporter) {
        close();
    }

    private void close() {
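        // Flush the SSTables to disk, then stream them to the cluster, keeping
        // the heartbeat running so the task is not killed in the meantime.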
        LOG.info("SSTables built. Now starting streaming");
        context.setStatus("streaming");
        heartbeat.startHeartbeat();
        try {
            if (writer != null) {
                writer.close();
                Future<StreamState> future = loader.stream(Collections.<InetAddress>emptySet(),
                        new ProgressIndicator());
                try {
                    StreamState streamState = Uninterruptibles.getUninterruptibly(future);
                    if (streamState.hasFailedSession()) {
                        LOG.warn("Some streaming sessions failed");
                    } else {
                        LOG.info("Streaming finished successfully");
                    }
                } catch (ExecutionException e) {
                    throw new CrunchRuntimeException(
                            "Streaming to the following hosts failed: " + loader.getFailedHosts(), e);
                }
            } else {
                LOG.info("SSTableWriter wasn't instantiated, no streaming happened.");
            }
        } catch (IOException e) {
            throw new CrunchRuntimeException(e);
        } finally {
            heartbeat.stopHeartbeat();
        }
    }
}
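
Usage example

For context, the sketch below shows how a job's Configuration might be prepared so that setConfigs() above succeeds. This is a minimal sketch, not the project's documented setup: the ConfigHelper calls are Cassandra's public Hadoop API, but the CrunchConfigHelper and CrunchCqlBulkOutputFormat setters are assumed to mirror the getters used in this file, and the keyspace, table, schema, and host names are invented for illustration.

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.hadoop.conf.Configuration;

import com.spotify.hdfs2cass.cassandra.cql.CrunchCqlBulkOutputFormat;
import com.spotify.hdfs2cass.crunch.CrunchConfigHelper;

public class BulkLoadConfigExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Cassandra's Hadoop ConfigHelper is public API: keyspace, partitioner,
        // and an initial contact point for streaming.
        ConfigHelper.setOutputKeyspace(conf, "example_ks");
        ConfigHelper.setOutputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setOutputInitialAddress(conf, "cassandra.example.com"); // hypothetical host

        // Assumed hdfs2cass setters, mirroring the getters read in setConfigs()
        // above; their exact signatures are not shown in this file.
        CrunchConfigHelper.setOutputColumnFamily(conf, "example_ks", "events");
        CrunchCqlBulkOutputFormat.setColumnFamilySchema(conf, "events",
                "CREATE TABLE example_ks.events (id text PRIMARY KEY, payload text)");
        CrunchCqlBulkOutputFormat.setColumnFamilyInsertStatement(conf, "events",
                "INSERT INTO example_ks.events (id, payload) VALUES (?, ?)");

        // With these set, CrunchCqlBulkRecordWriter.setConfigs() finds everything
        // it needs; each write(key, record) then appends a row to a local SSTable,
        // and close() streams the finished tables into the cluster.
    }
}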