org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.java, a set of helper functions and classes used by BigQueryIO.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.beam.sdk.io.gcp.bigquery;

import static com.google.common.base.Preconditions.checkState;

import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.hash.Hashing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResolveOptions;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunction;

/** A set of helper functions and classes used by {@link BigQueryIO}. */
public class BigQueryHelpers {
    private static final String RESOURCE_NOT_FOUND_ERROR =
            "BigQuery %1$s not found for table \"%2$s\". Please create the %1$s before pipeline"
                    + " execution. If the %1$s is created by an earlier stage of the pipeline, this"
                    + " validation can be disabled using #withoutValidation.";

    private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
            "Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
                    + " an earlier stage of the pipeline, this validation can be disabled using"
                    + " #withoutValidation.";

    /** Status of a BigQuery job or request. */
    enum Status {
        SUCCEEDED, FAILED, UNKNOWN,
    }

    /** Return a displayable string representation for a {@link TableReference}. */
    @Nullable
    static ValueProvider<String> displayTable(@Nullable ValueProvider<TableReference> table) {
        if (table == null) {
            return null;
        }
        return NestedValueProvider.of(table, new TableRefToTableSpec());
    }

    /** Returns a canonical string representation of the {@link TableReference}. */
    public static String toTableSpec(TableReference ref) {
        StringBuilder sb = new StringBuilder();
        if (ref.getProjectId() != null) {
            sb.append(ref.getProjectId());
            sb.append(":");
        }

        sb.append(ref.getDatasetId()).append('.').append(ref.getTableId());
        return sb.toString();
    }

    static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
        List<V> value = map.get(key);
        if (value == null) {
            value = new ArrayList<>();
            map.put(key, value);
        }
        return value;
    }

    /**
     * Parse a table specification in the form {@code "[project_id]:[dataset_id].[table_id]"} or
     * {@code "[dataset_id].[table_id]"}.
     *
     * <p>If the project id is omitted, the default project id is used.
     */
    public static TableReference parseTableSpec(String tableSpec) {
        Matcher match = BigQueryIO.TABLE_SPEC.matcher(tableSpec);
        if (!match.matches()) {
            throw new IllegalArgumentException(
                    "Table reference is not in [project_id]:[dataset_id].[table_id] format: " + tableSpec);
        }

        TableReference ref = new TableReference();
        ref.setProjectId(match.group("PROJECT"));

        return ref.setDatasetId(match.group("DATASET")).setTableId(match.group("TABLE"));
    }
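
    // Illustrative example: parseTableSpec("my-project:my_dataset.my_table") yields a TableReference
    // with project id "my-project", dataset id "my_dataset" and table id "my_table", while
    // "my_dataset.my_table" parses with the project id left unset.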

    static String jobToPrettyString(@Nullable Job job) throws IOException {
        return job == null ? "null" : job.toPrettyString();
    }

    static String statusToPrettyString(@Nullable JobStatus status) throws IOException {
        return status == null ? "Unknown status: null." : status.toPrettyString();
    }

    static Status parseStatus(@Nullable Job job) {
        if (job == null) {
            return Status.UNKNOWN;
        }
        JobStatus status = job.getStatus();
        if (status.getErrorResult() != null) {
            return Status.FAILED;
        } else if (status.getErrors() != null && !status.getErrors().isEmpty()) {
            return Status.FAILED;
        } else {
            return Status.SUCCEEDED;
        }
    }

    @VisibleForTesting
    static String toJsonString(Object item) {
        if (item == null) {
            return null;
        }
        try {
            return BigQueryIO.JSON_FACTORY.toString(item);
        } catch (IOException e) {
            throw new RuntimeException(
                    String.format("Cannot serialize %s to a JSON string.", item.getClass().getSimpleName()), e);
        }
    }

    @VisibleForTesting
    static <T> T fromJsonString(String json, Class<T> clazz) {
        if (json == null) {
            return null;
        }
        try {
            return BigQueryIO.JSON_FACTORY.fromString(json, clazz);
        } catch (IOException e) {
            throw new RuntimeException(String.format("Cannot deserialize %s from a JSON string: %s.", clazz, json),
                    e);
        }
    }

    /**
     * Returns a randomUUID string.
     *
     * <p>{@code '-'} is removed because BigQuery doesn't allow it in dataset id.
     */
    static String randomUUIDString() {
        return UUID.randomUUID().toString().replaceAll("-", "");
    }

    static void verifyTableNotExistOrEmpty(DatasetService datasetService, TableReference tableRef) {
        try {
            if (datasetService.getTable(tableRef) != null) {
                checkState(datasetService.isTableEmpty(tableRef), "BigQuery table is not empty: %s.",
                        toTableSpec(tableRef));
            }
        } catch (IOException | InterruptedException e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new RuntimeException(
                    "unable to confirm BigQuery table emptiness for table " + toTableSpec(tableRef), e);
        }
    }

    static void verifyDatasetPresence(DatasetService datasetService, TableReference table) {
        try {
            datasetService.getDataset(table.getProjectId(), table.getDatasetId());
        } catch (Exception e) {
            ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
            if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
                throw new IllegalArgumentException(
                        String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", toTableSpec(table)), e);
            } else if (e instanceof RuntimeException) {
                throw (RuntimeException) e;
            } else {
                throw new RuntimeException(
                        String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset", toTableSpec(table)),
                        e);
            }
        }
    }

    static void verifyTablePresence(DatasetService datasetService, TableReference table) {
        try {
            datasetService.getTable(table);
        } catch (Exception e) {
            ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
            if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
                throw new IllegalArgumentException(
                        String.format(RESOURCE_NOT_FOUND_ERROR, "table", toTableSpec(table)), e);
            } else if (e instanceof RuntimeException) {
                throw (RuntimeException) e;
            } else {
                throw new RuntimeException(
                        String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table", toTableSpec(table)),
                        e);
            }
        }
    }

    // Create a unique job id for a table load.
    static String createJobId(String prefix, TableDestination tableDestination, int partition) {
        // Job ID must be different for each partition of each table.
        String destinationHash = Hashing.murmur3_128().hashUnencodedChars(tableDestination.toString()).toString();
        if (partition >= 0) {
            return String.format("%s_%s_%05d", prefix, destinationHash, partition);
        } else {
            return String.format("%s_%s", prefix, destinationHash);
        }
    }
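
    // For example, a prefix of "beam_load_job" and partition 3 would produce an id of the form
    // "beam_load_job_<32-character murmur3 hash of the destination>_00003"; a negative partition
    // omits the numeric suffix.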

    @VisibleForTesting
    static class JsonSchemaToTableSchema implements SerializableFunction<String, TableSchema> {
        @Override
        public TableSchema apply(String from) {
            return fromJsonString(from, TableSchema.class);
        }
    }

    static class TableSchemaToJsonSchema implements SerializableFunction<TableSchema, String> {
        @Override
        public String apply(TableSchema from) {
            return toJsonString(from);
        }
    }

    static class JsonTableRefToTableRef implements SerializableFunction<String, TableReference> {
        @Override
        public TableReference apply(String from) {
            return fromJsonString(from, TableReference.class);
        }
    }

    static class JsonTableRefToTableSpec implements SerializableFunction<String, String> {
        @Override
        public String apply(String from) {
            return toTableSpec(fromJsonString(from, TableReference.class));
        }
    }

    static class TableRefToTableSpec implements SerializableFunction<TableReference, String> {
        @Override
        public String apply(TableReference from) {
            return toTableSpec(from);
        }
    }

    static class TableRefToJson implements SerializableFunction<TableReference, String> {
        @Override
        public String apply(TableReference from) {
            return toJsonString(from);
        }
    }

    @VisibleForTesting
    static class TableSpecToTableRef implements SerializableFunction<String, TableReference> {
        @Override
        public TableReference apply(String from) {
            return parseTableSpec(from);
        }
    }

    static String createJobIdToken(String jobName, String stepUuid) {
        return String.format("beam_job_%s_%s", stepUuid, jobName.replaceAll("-", ""));
    }

    static String getExtractJobId(String jobIdToken) {
        return String.format("%s-extract", jobIdToken);
    }

    static TableReference createTempTableReference(String projectId, String jobUuid) {
        String queryTempDatasetId = "temp_dataset_" + jobUuid;
        String queryTempTableId = "temp_table_" + jobUuid;
        TableReference queryTempTableRef = new TableReference().setProjectId(projectId)
                .setDatasetId(queryTempDatasetId).setTableId(queryTempTableId);
        return queryTempTableRef;
    }

    static String resolveTempLocation(String tempLocationDir, String bigQueryOperationName, String stepUuid) {
        return FileSystems.matchNewResource(tempLocationDir, true)
                .resolve(bigQueryOperationName, ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY)
                .resolve(stepUuid, ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY).toString();
    }
}
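
Example usage

The snippet below is a minimal sketch of how the public helpers above might be exercised. It assumes the beam-sdks-java-io-google-cloud-platform artifact and its dependencies are on the classpath; the BigQueryHelpersDemo class name and the table spec values are illustrative.

import com.google.api.services.bigquery.model.TableReference;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;

public class BigQueryHelpersDemo {
    public static void main(String[] args) {
        // Parse a fully qualified table spec into a TableReference.
        TableReference ref = BigQueryHelpers.parseTableSpec("my-project:my_dataset.my_table");

        // Render it back to the canonical "[project_id]:[dataset_id].[table_id]" form.
        System.out.println(BigQueryHelpers.toTableSpec(ref)); // my-project:my_dataset.my_table

        // A spec without a project id parses with the project left unset.
        TableReference noProject = BigQueryHelpers.parseTableSpec("my_dataset.my_table");
        System.out.println(noProject.getProjectId()); // null
    }
}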