com.google.cloud.sparkdemo.CloudPubsubReceiver.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.sparkdemo.CloudPubsubReceiver.java

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 * This is not an official Google product.
 */

package com.google.cloud.sparkdemo;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.googleapis.util.Utils;
import com.google.api.client.http.HttpRequestInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.services.pubsub.Pubsub;
import com.google.api.services.pubsub.PubsubScopes;
import com.google.api.services.pubsub.model.*;

import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.receiver.Receiver;

public class CloudPubsubReceiver extends Receiver<String> {
    // 429 is not supported in either java.net.HttpURLConnection, org.apache.http.HttpStatus
    // or com.google.api.client.http.HttpStatusCodes;
    // TODO: Add additional codes to com.google.api.client.http.HttpStatusCodes package
    private final int HTTP_TOO_MANY_REQUESTS = 429;
    // Backoff time when pubsub is throttled
    private final int MIN_BACKOFF_SECONDS = 1;
    private final int MAX_BACKOFF_SECONDS = 64;
    // Maximum # of messages in each Pub/sub Pull request
    private final int BATCH_SIZE = 500;

    private String projectFullName;
    private String topicFullName;
    private String subscriptionFullName;

    public CloudPubsubReceiver(String projectName, String topicName, String subscriptionName) {
        super(StorageLevel.MEMORY_AND_DISK_2());
        this.projectFullName = "projects/" + projectName;
        this.topicFullName = projectFullName + "/topics/" + topicName;
        this.subscriptionFullName = projectFullName + "/subscriptions/" + subscriptionName;
    }

    public void onStart() {
        Pubsub client = createAuthorizedClient();
        Subscription subscription = new Subscription().setTopic(topicFullName);
        try {
            // Create a subscription if it does not exist.
            subscription = client.projects().subscriptions().create(subscriptionFullName, subscription).execute();
        } catch (GoogleJsonResponseException e) {
            if (e.getDetails().getCode() == HttpURLConnection.HTTP_CONFLICT) {
                // Subscription already exists, but that's the expected behavior with multiple receivers.
            } else {
                reportSubscriptionCreationError(e);
            }
        } catch (IOException e) {
            reportSubscriptionCreationError(e);
        }

        // Start the thread that receives data over a connection
        // TODO: start a threadpool instead
        new Thread() {
            @Override
            public void run() {
                receive();
            }
        }.start();
    }

    public void onStop() {
        // Delete the subscription
        try {
            Pubsub client = createAuthorizedClient();
            client.projects().subscriptions().delete(subscriptionFullName).execute();
        } catch (GoogleJsonResponseException e) {
            if (e.getDetails().getCode() == HttpURLConnection.HTTP_NOT_FOUND) {
                // Subscription may has already been deleted, but that's the expected behavior
                // with multiple receivers.
            } else {
                reportSubscriptionDeleteionError(e);
            }
        } catch (IOException e) {
            reportSubscriptionDeleteionError(e);
        }
    }

    // Pull messages from Pubsub and store as RDD.
    private void receive() {
        Pubsub client = createAuthorizedClient();
        PullRequest pullRequest = new PullRequest().setReturnImmediately(false).setMaxMessages(BATCH_SIZE);

        int backoffTimeSeconds = MIN_BACKOFF_SECONDS;
        do {
            try {
                PullResponse pullResponse = client.projects().subscriptions()
                        .pull(subscriptionFullName, pullRequest).execute();

                List<ReceivedMessage> receivedMessages = pullResponse.getReceivedMessages();
                if (receivedMessages != null) {
                    // Store the message contents in batch
                    store(receivedMessages.stream().filter(m -> m.getMessage() != null)
                            .filter(m -> m.getMessage().decodeData() != null).map(m -> {
                                try {
                                    return new String(m.getMessage().decodeData(), "UTF-8");
                                } catch (UnsupportedEncodingException e) {
                                    // Wrong encode
                                    return null;
                                }
                            }).filter(m -> m != null).iterator());

                    AcknowledgeRequest ackRequest = new AcknowledgeRequest();
                    ackRequest.setAckIds(
                            receivedMessages.stream().map(ReceivedMessage::getAckId).collect(Collectors.toList()));
                    client.projects().subscriptions().acknowledge(subscriptionFullName, ackRequest).execute();
                    // Reset backoff time
                    backoffTimeSeconds = MIN_BACKOFF_SECONDS;
                }
            } catch (GoogleJsonResponseException e) {
                if (e.getDetails().getCode() == HTTP_TOO_MANY_REQUESTS) {
                    // When PubSub is rate throttled, retry with exponential backoff.
                    // TODO: Extract "retry-after" vaule from Http response, if available.
                    reportError("Reading from subscription " + subscriptionFullName
                            + " is throttled. Will retry after " + backoffTimeSeconds + " seconds.", e);
                    wait(backoffTimeSeconds);
                    backoffTimeSeconds = Math.min(backoffTimeSeconds << 1, MAX_BACKOFF_SECONDS);
                } else {
                    reportReadError(e);
                }
            } catch (IOException e) {
                reportReadError(e);
            }
        } while (!isStopped());
    }

    private Pubsub createAuthorizedClient() {
        try {
            // Create the credential
            HttpTransport httpTransport = Utils.getDefaultTransport();
            JsonFactory jsonFactory = Utils.getDefaultJsonFactory();
            GoogleCredential credential = GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);

            if (credential.createScopedRequired()) {
                credential = credential.createScoped(PubsubScopes.all());
            }
            HttpRequestInitializer initializer = new RetryHttpInitializerWrapper(credential);
            return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
                    .setApplicationName("spark-pubsub-receiver").build();
        } catch (IOException e) {
            reportError("Unable to create Cloud Pub/sub client.", e);
            return null;
        }
    }

    private void reportSubscriptionCreationError(Throwable e) {
        stop("Unable to create subscription: " + subscriptionFullName + " for topic " + topicFullName, e);
    }

    private void reportReadError(Throwable e) {
        stop("Unable to read subscription: " + subscriptionFullName, e);
    }

    private void reportSubscriptionDeleteionError(Throwable e) {
        reportError("Unable to delete subscription: " + subscriptionFullName, e);
    }

    private void wait(int backoffTimeSeconds) {
        try {
            Thread.sleep(1000 * backoffTimeSeconds);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
    }
}