org.apache.beam.sdk.testing.FileChecksumMatcher.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.beam.sdk.testing.FileChecksumMatcher.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.beam.sdk.testing;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.base.Strings;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.sdk.util.NumberedShardedFile;
import org.apache.beam.sdk.util.ShardedFile;
import org.apache.beam.sdk.util.Sleeper;
import org.hamcrest.Description;
import org.hamcrest.TypeSafeMatcher;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Matcher to verify file checksum in E2E test.
 *
 * <p>For example:
 * <pre>{@code
 *   assertThat(job, new FileChecksumMatcher(checksumString, filePath));
 * }</pre>
 * or
 * <pre>{@code
 *   assertThat(job, new FileChecksumMatcher(checksumString, filePath, shardTemplate));
 * }</pre>
 *
 * <p>Checksum of outputs is generated based on SHA-1 algorithm. If output file is empty,
 * SHA-1 hash of empty string (da39a3ee5e6b4b0d3255bfef95601890afd80709) is used as expected.
 *
 * <p>The per-string hashes are combined in an order-independent way, so the checksum does not
 * depend on the order in which the strings are read back.
 */
public class FileChecksumMatcher extends TypeSafeMatcher<PipelineResult>
        implements SerializableMatcher<PipelineResult> {

    private static final Logger LOG = LoggerFactory.getLogger(FileChecksumMatcher.class);

    /** Maximum number of retries when reading the output files. */
    static final int MAX_READ_RETRIES = 4;
    /** Initial backoff between two read attempts. */
    static final Duration DEFAULT_SLEEP_DURATION = Duration.standardSeconds(10L);
    /** Backoff configuration used for every read performed by {@link #matchesSafely}. */
    static final FluentBackoff BACK_OFF_FACTORY = FluentBackoff.DEFAULT.withInitialBackoff(DEFAULT_SLEEP_DURATION)
            .withMaxRetries(MAX_READ_RETRIES);

    /** Shard-name pattern used when the caller does not supply one. */
    private static final Pattern DEFAULT_SHARD_TEMPLATE = Pattern
            .compile("(?x) \\S* (?<shardnum> \\d+) -of- (?<numshards> \\d+)");

    private final String expectedChecksum;
    /** Checksum computed by the most recent {@link #matchesSafely} call; null before that. */
    private String actualChecksum;
    private final ShardedFile shardedFile;

    /**
     * Constructor that uses default shard template.
     *
     * @param checksum expected checksum string used to verify file content.
     * @param filePath path of files that's to be verified.
     */
    public FileChecksumMatcher(String checksum, String filePath) {
        this(checksum, filePath, DEFAULT_SHARD_TEMPLATE);
    }

    /**
     * Constructor using a custom shard template.
     *
     * @param checksum expected checksum string used to verify file content.
     * @param filePath path of files that's to be verified.
     * @param shardTemplate template of shard name to parse out the total number of shards
     *                      which is used in I/O retry to avoid inconsistency of filesystem.
     *                      Customized template should assign name "numshards" to capturing
     *                      group - total shard number.
     * @throws IllegalArgumentException if {@code checksum} or {@code filePath} is null or empty.
     * @throws NullPointerException if {@code shardTemplate} is null.
     */
    public FileChecksumMatcher(String checksum, String filePath, Pattern shardTemplate) {
        checkArgument(!Strings.isNullOrEmpty(checksum), "Expected valid checksum, but received %s", checksum);
        checkArgument(!Strings.isNullOrEmpty(filePath), "Expected valid file path, but received %s", filePath);
        checkNotNull(shardTemplate, "Expected non-null shard pattern. "
                + "Please call the other constructor to use default pattern: %s", DEFAULT_SHARD_TEMPLATE);

        this.expectedChecksum = checksum;
        this.shardedFile = new NumberedShardedFile(filePath, shardTemplate);
    }

    /**
     * Constructor using an entirely custom {@link ShardedFile} implementation.
     *
     * <p>For internal use only.
     *
     * @param expectedChecksum expected checksum string used to verify file content.
     * @param shardedFile source the output strings are read from.
     * @throws IllegalArgumentException if {@code expectedChecksum} is null or empty.
     * @throws NullPointerException if {@code shardedFile} is null.
     */
    public FileChecksumMatcher(String expectedChecksum, ShardedFile shardedFile) {
        // Fail fast, consistently with the shard-template constructor, instead of deferring
        // the failure to matchesSafely() where its cause is much harder to diagnose.
        checkArgument(!Strings.isNullOrEmpty(expectedChecksum),
                "Expected valid checksum, but received %s", expectedChecksum);
        checkNotNull(shardedFile, "Expected non-null sharded file");

        this.expectedChecksum = expectedChecksum;
        this.shardedFile = shardedFile;
    }

    /**
     * Reads the sharded output (with retries), computes its checksum and compares it against the
     * expected one.
     *
     * <p>The computed checksum is remembered so that {@link #describeMismatchSafely} can report it.
     *
     * @param pipelineResult the result being matched; it is not inspected, only the files are read.
     * @return {@code true} if the computed checksum equals the expected checksum.
     * @throws RuntimeException if the files cannot be read; the original failure is the cause.
     */
    @Override
    public boolean matchesSafely(PipelineResult pipelineResult) {
        // Load output data
        List<String> outputs;
        try {
            outputs = shardedFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff());
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can still observe it.
                Thread.currentThread().interrupt();
            }
            throw new RuntimeException(String.format("Failed to read from: %s", shardedFile), e);
        }

        // Verify outputs. Checksum is computed using SHA-1 algorithm
        actualChecksum = computeHash(outputs);
        LOG.debug("Generated checksum: {}", actualChecksum);

        return actualChecksum.equals(expectedChecksum);
    }

    /**
     * Computes the checksum of the given strings.
     *
     * @param strs strings read from the output files.
     * @return the SHA-1 of the empty string when {@code strs} is empty, otherwise the unordered
     *     combination of the SHA-1 hashes of each string (UTF-8 encoded).
     */
    private String computeHash(@Nonnull List<String> strs) {
        // Handled separately: Guava documents that combineUnordered rejects an empty input.
        if (strs.isEmpty()) {
            return Hashing.sha1().hashString("", StandardCharsets.UTF_8).toString();
        }

        List<HashCode> hashCodes = new ArrayList<>(strs.size());
        for (String str : strs) {
            hashCodes.add(Hashing.sha1().hashString(str, StandardCharsets.UTF_8));
        }
        return Hashing.combineUnordered(hashCodes).toString();
    }

    /** Appends the expected checksum to the description. */
    @Override
    public void describeTo(Description description) {
        description.appendText("Expected checksum is (").appendText(expectedChecksum).appendText(")");
    }

    /** Appends the checksum computed by the last {@link #matchesSafely} call to the description. */
    @Override
    public void describeMismatchSafely(PipelineResult pResult, Description description) {
        description.appendText("was (").appendText(actualChecksum).appendText(")");
    }
}