com.microsoft.azure.management.datalake.store.uploader.DataLakeStoreUploader.java Source code

Introduction

Here is the source code for com.microsoft.azure.management.datalake.store.uploader.DataLakeStoreUploader.java, a general-purpose uploader of large local files into Azure Data Lake Store.
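
A minimal usage sketch before the listing. It assumes an UploadParameters instance and a FrontEndAdapter implementation have already been built elsewhere (their construction is credential- and SDK-specific and is not shown in this file); only the constructor and execute() that appear in the listing below are used.

import com.microsoft.azure.management.datalake.store.uploader.DataLakeStoreUploader;
import com.microsoft.azure.management.datalake.store.uploader.FrontEndAdapter;
import com.microsoft.azure.management.datalake.store.uploader.UploadParameters;

public class UploaderUsageSketch {
    /**
     * Uploads (or resumes) the file described by the given parameters.
     * The caller supplies both arguments; nothing beyond the members shown
     * in the listing below is assumed here.
     */
    public static void upload(UploadParameters parameters, FrontEndAdapter frontEnd) throws Exception {
        DataLakeStoreUploader uploader = new DataLakeStoreUploader(parameters, frontEnd);
        uploader.execute(); // validates parameters and metadata, uploads the segments, then deletes the metadata file
    }
}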

Source

/**
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for
 * license information.
 */
package com.microsoft.azure.management.datalake.store.uploader;

import org.apache.commons.lang3.StringUtils;

import javax.management.OperationsException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Represents a general-purpose file uploader into Azure Data Lake Store. Supports the efficient upload of large files.
 */
public class DataLakeStoreUploader {

    /**
     * The maximum number of parallel threads to allow.
     */
    public static final int MAX_ALLOWED_THREADS = 1024;
    private FrontEndAdapter frontEnd;
    private String metadataFilePath;
    private UploadParameters parameters;

    /**
     * Creates a new instance of the DataLakeStoreUploader class, using the given FrontEnd adapter for the upload.
     *
     * @param uploadParameters The upload parameters to use.
     * @param frontEnd The FrontEnd adapter to use for the upload.
     * @throws FileNotFoundException if the local input file cannot be found or is inaccessible.
     */
    public DataLakeStoreUploader(UploadParameters uploadParameters, FrontEndAdapter frontEnd)
            throws FileNotFoundException {
        this.parameters = uploadParameters;
        this.frontEnd = frontEnd;

        //ensure that input parameters are correct
        validateParameters();
        metadataFilePath = getCanonicalMetadataFilePath();
    }

    /**
     * Gets the canonical metadata file path.
     *
     * @return A string representation of the canonical metadata file path.
     */
    private String getCanonicalMetadataFilePath() {
        return Paths.get(this.getParameters().getLocalMetadataLocation(), MessageFormat.format("{0}.upload.xml",
                Paths.get(this.getParameters().getInputFilePath()).getFileName())).toString();
    }

    /**
     * Gets the parameters to use for this upload.
     * @return the parameters for this upload.
     */
    public UploadParameters getParameters() {
        return parameters;
    }

    /**
     * Executes the upload as defined by the input parameters.
     *
     * @throws Exception if there is any failure that occurs during execution.
     */
    public void execute() throws Exception {
        //load up existing metadata or create a fresh one
        UploadMetadata metadata = getMetadata();

        if (metadata.getSegmentCount() < this.getParameters().getThreadCount()) {
            // reducing the thread count to make it equal to the segment count
            // if it is larger, since those extra threads will not be used.
            this.getParameters().setThreadCount(metadata.getSegmentCount());
        }

        //begin (or resume) uploading the file
        uploadFile(metadata);

        //clean up metadata after a successful upload
        metadata.deleteFile();
    }

    /**
     * Validates the parameters.
     *
     * @throws FileNotFoundException if the input file cannot be found.
     * @throws IllegalArgumentException if the account name or target stream path is null or empty, the stream path ends with a '/', or the thread count is out of range.
     */
    private void validateParameters() throws FileNotFoundException, IllegalArgumentException {
        if (!(new File(this.getParameters().getInputFilePath()).exists())) {
            throw new FileNotFoundException(
                    "Could not find input file: " + this.getParameters().getInputFilePath());
        }

        if (this.getParameters().getTargetStreamPath() == null
                || StringUtils.isEmpty(this.getParameters().getTargetStreamPath())) {
            throw new IllegalArgumentException("Null or empty Target Stream path");
        }

        if (this.getParameters().getTargetStreamPath().endsWith("/")) {
            throw new IllegalArgumentException("Invalid TargetStreamPath, a stream path should not end with /");
        }

        if (this.getParameters().getAccountName() == null
                || StringUtils.isEmpty(this.getParameters().getAccountName())) {
            throw new IllegalArgumentException("Null or empty Account Name");
        }

        if (this.getParameters().getThreadCount() < 1
                || this.getParameters().getThreadCount() > MAX_ALLOWED_THREADS) {
            throw new IllegalArgumentException(
                    MessageFormat.format("ThreadCount must be at least 1 and at most {0}", MAX_ALLOWED_THREADS));
        }
    }

    /**
     * Gets the metadata for this upload, either by loading an existing metadata file (when resuming) or by creating a new one.
     *
     * @return The {@link UploadMetadata} used by this upload.
     * @throws IOException if the metadata file cannot be read or written.
     * @throws InvalidMetadataException if an existing metadata file cannot be parsed.
     * @throws UploadFailedException if metadata for this upload cannot be generated.
     */
    private UploadMetadata getMetadata() throws IOException, InvalidMetadataException, UploadFailedException {
        UploadMetadataGenerator metadataGenerator = new UploadMetadataGenerator(this.parameters);
        if (this.getParameters().isResume()) {
            return metadataGenerator.getExistingMetadata(metadataFilePath);
        } else {
            return metadataGenerator.createNewMetadata(metadataFilePath);
        }
    }

    /**
     * Deletes the metadata file from disk.
     */
    public void deleteMetadataFile() {
        File toDelete = new File(metadataFilePath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
    }

    /**
     * Validates that the metadata is usable for a resume operation and updates the internal segment states to match the current state of the server.
     * If any changes are made, the metadata is saved back to its canonical location.
     *
     * @param metadata The {@link UploadMetadata} to resume the upload from.
     * @throws Exception if the metadata is inconsistent with the local file or the target stream, or if the server state cannot be retrieved.
     */
    private void validateMetadataForResume(UploadMetadata metadata) throws Exception {
        validateMetadataMatchesLocalFile(metadata);

        //verify that the target stream does not already exist (in case we don't want to overwrite)
        if (!this.getParameters().isOverwrite() && frontEnd.streamExists(metadata.getTargetStreamPath())) {
            throw new OperationsException("Target Stream already exists");
        }

        //make sure we don't upload part of the file as binary, while the rest is non-binary (that's just asking for trouble)
        if (this.getParameters().isBinary() != metadata.isBinary()) {
            throw new OperationsException(MessageFormat.format(
                    "Existing metadata was created for a {0}binary file while the current parameters requested a {1}binary upload.",
                    metadata.isBinary() ? "" : "non-", this.getParameters().isBinary() ? "" : "non-"));
        }

        //see what files(segments) already exist - update metadata accordingly (only for segments that are missing from server; if it's on the server but not in metadata, reupload)
        for (UploadSegmentMetadata segment : metadata.getSegments()) {
            if (segment.getStatus() == SegmentUploadStatus.Complete) {
                int retryCount = 0;
                while (retryCount < SingleSegmentUploader.MAX_BUFFER_UPLOAD_ATTEMPT_COUNT) {
                    retryCount++;
                    try {
                        //verify that the stream exists and that the length is as expected
                        if (!frontEnd.streamExists(segment.getPath())) {
                            // this segment was marked as completed, but no target stream exists; it needs to be reuploaded
                            segment.setStatus(SegmentUploadStatus.Pending);
                        } else {
                            long remoteLength = frontEnd.getStreamLength(segment.getPath());
                            if (remoteLength != segment.getLength()) {
                                //the target stream has a different length than the input segment, which implies they are inconsistent; it needs to be reuploaded
                                segment.setStatus(SegmentUploadStatus.Pending);
                            }
                        }

                        break;
                    } catch (Exception e) {
                        if (retryCount >= SingleSegmentUploader.MAX_BUFFER_UPLOAD_ATTEMPT_COUNT) {
                            throw new UploadFailedException(MessageFormat.format(
                                    "Cannot validate metadata in order to resume due to the following exception retrieving file information: {0}",
                                    e));
                        }

                        SingleSegmentUploader.waitForRetry(retryCount,
                                parameters.isUseSegmentBlockBackOffRetryStrategy());
                    }
                }
            } else {
                //anything which is not in 'Completed' status needs to be reuploaded
                segment.setStatus(SegmentUploadStatus.Pending);
            }
        }
        metadata.save();
    }

    /**
     * Verifies that the metadata is valid for a fresh upload.
     *
     * @param metadata The {@link UploadMetadata} to validate for a fresh upload.
     * @throws Exception if the metadata does not match the local file, or if the target stream already exists and overwrite was not requested.
     */
    private void validateMetadataForFreshUpload(UploadMetadata metadata) throws Exception {
        validateMetadataMatchesLocalFile(metadata);

        //verify that the target stream does not already exist (in case we don't want to overwrite)
        if (!this.getParameters().isOverwrite() && frontEnd.streamExists(metadata.getTargetStreamPath())) {
            throw new OperationsException("Target Stream already exists");
        }
    }

    /**
     * Verifies that the metadata is consistent with the local file and the input parameters.
     *
     * @param metadata The {@link UploadMetadata} to check against the local file.
     * @throws OperationsException if the metadata refers to a different target stream or input file, or if the local file's size differs from the recorded one.
     */
    private void validateMetadataMatchesLocalFile(UploadMetadata metadata) throws OperationsException {
        if (!metadata.getTargetStreamPath().trim()
                .equalsIgnoreCase(this.getParameters().getTargetStreamPath().trim())) {
            throw new OperationsException("Metadata points to a different target stream than the input parameters");
        }

        //verify that it matches against local file (size, name)
        File metadataInputFileInfo = new File(metadata.getInputFilePath());
        File paramInputFileInfo = new File(this.getParameters().getInputFilePath());

        if (!paramInputFileInfo.toString().toLowerCase().equals(metadataInputFileInfo.toString().toLowerCase())) {
            throw new OperationsException("The metadata refers to different file than the one requested");
        }

        if (!metadataInputFileInfo.exists()) {
            throw new OperationsException("The metadata refers to a file that does not exist");
        }

        if (metadata.getFileLength() != metadataInputFileInfo.length()) {
            throw new OperationsException("The metadata's file information differs from the actual file");
        }
    }

    /**
     * Uploads the file using the given metadata.
     *
     * @param metadata The {@link UploadMetadata} to use to upload the file.
     * @throws Exception if metadata validation, the upload itself, or the final concatenation fails.
     */
    private void uploadFile(UploadMetadata metadata) throws Exception {
        try {
            //TODO: figure out if we need a ServicePointManager equivalent for the connection limit
            //match up the metadata with the information on the server
            if (this.getParameters().isResume()) {
                validateMetadataForResume(metadata);
            } else {
                validateMetadataForFreshUpload(metadata);
            }

            // TODO: figure out if we need a way to track progress.
            if (metadata.getSegmentCount() == 0) {
                // simply create the target stream, overwriting existing streams if they exist
                frontEnd.createStream(metadata.getTargetStreamPath(), true, null, 0);
            } else if (metadata.getSegmentCount() > 1) {
                //perform the multi-segment upload
                MultipleSegmentUploader msu = new MultipleSegmentUploader(metadata,
                        this.getParameters().getThreadCount(), frontEnd);
                msu.setUseSegmentBlockBackOffRetryStrategy(
                        this.getParameters().isUseSegmentBlockBackOffRetryStrategy());
                msu.upload();

                //concatenate the files at the end
                concatenateSegments(metadata);
            } else {
                //optimization if we only have one segment: upload it directly to the target stream
                UploadSegmentMetadata[] toUse = metadata.getSegments();
                toUse[0].setPath(metadata.getTargetStreamPath());
                metadata.setSegments(toUse);
                SingleSegmentUploader ssu = new SingleSegmentUploader(0, metadata, frontEnd);
                ssu.setUseBackOffRetryStrategy(this.getParameters().isUseSegmentBlockBackOffRetryStrategy());
                ssu.upload();
            }
        } catch (InterruptedException ex) {
            // do nothing since we have already marked everything as failed
        }
    }

    /**
     * Concatenates all the segments defined in the metadata into a single target stream.
     *
     * @param metadata The {@link UploadMetadata} that defines the segments to concatenate.
     * @throws Exception if the target stream already exists and overwrite was not requested, or if any segment is missing, incomplete, or has an unexpected length.
     */
    private void concatenateSegments(final UploadMetadata metadata) throws Exception {
        final String[] inputPaths = new String[metadata.getSegmentCount()];

        //verify if target stream exists
        if (frontEnd.streamExists(metadata.getTargetStreamPath())) {
            if (this.getParameters().isOverwrite()) {
                frontEnd.deleteStream(metadata.getTargetStreamPath(), false);
            } else {
                throw new OperationsException("Target Stream already exists");
            }
        }

        //ensure all input streams exist and are of the expected length
        //ensure all segments in the metadata are marked as 'complete'
        final List<Exception> exceptions = new ArrayList<>();
        ExecutorService exec = Executors.newFixedThreadPool(this.getParameters().getThreadCount());
        for (int i = 0; i < metadata.getSegmentCount(); i++) {
            final int finalI = i;
            exec.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        if (metadata.getSegments()[finalI].getStatus() != SegmentUploadStatus.Complete) {
                            throw new UploadFailedException(
                                    "Cannot perform 'concatenate' operation because not all streams are fully uploaded.");
                        }

                        String remoteStreamPath = metadata.getSegments()[finalI].getPath();
                        int retryCount = 0;
                        long remoteLength = -1;

                        while (retryCount < SingleSegmentUploader.MAX_BUFFER_UPLOAD_ATTEMPT_COUNT) {
                            retryCount++;
                            try {
                                remoteLength = frontEnd.getStreamLength(remoteStreamPath);
                                break;
                            } catch (Exception e) {
                                if (retryCount >= SingleSegmentUploader.MAX_BUFFER_UPLOAD_ATTEMPT_COUNT) {
                                    throw new UploadFailedException(MessageFormat.format(
                                            "Cannot perform 'concatenate' operation due to the following exception retrieving file information: {0}",
                                            e));
                                }

                                SingleSegmentUploader.waitForRetry(retryCount,
                                        parameters.isUseSegmentBlockBackOffRetryStrategy());
                            }
                        }

                        if (remoteLength != metadata.getSegments()[finalI].getLength()) {
                            throw new UploadFailedException(MessageFormat.format(
                                    "Cannot perform 'concatenate' operation because segment {0} has an incorrect length (expected {1}, actual {2}).",
                                    finalI, metadata.getSegments()[finalI].getLength(), remoteLength));
                        }

                        inputPaths[finalI] = remoteStreamPath;

                    } catch (Exception ex) {
                        //collect any exceptions, whether they were generated above or thrown by the front end
                        synchronized (exceptions) {
                            exceptions.add(ex);
                        }
                    }
                }
            });
        }

        exec.shutdown();

        try {
            exec.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); // waits ~292 years for completion or interruption.
        } catch (InterruptedException e) {
            // add the exception since it will indicate that it was cancelled.
            exceptions.add(e);
        }

        if (exceptions.size() > 0) {
            throw new AggregateUploadException("At least one concatenate test failed", exceptions.remove(0),
                    exceptions);
        }

        //issue the command
        frontEnd.concatenate(metadata.getTargetStreamPath(), inputPaths);
    }
}
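
The retry loop in validateMetadataForResume and the fan-out in concatenateSegments share a pattern: each unit of work is submitted to a fixed thread pool, retried a bounded number of times with a back-off wait, and any exception that survives the retries is collected for the caller to aggregate. A simplified, self-contained sketch of that pattern follows; the verify method and the linear back-off are stand-ins and are not part of the SDK.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class BoundedRetryFanOutSketch {
    private static final int MAX_ATTEMPTS = 4;

    public static void main(String[] args) throws InterruptedException {
        // exceptions thrown by worker threads are collected here, as in concatenateSegments
        final List<Exception> exceptions = Collections.synchronizedList(new ArrayList<Exception>());
        ExecutorService exec = Executors.newFixedThreadPool(4);

        for (int i = 0; i < 16; i++) {
            final int unit = i;
            exec.submit(new Runnable() {
                @Override
                public void run() {
                    int attempt = 0;
                    while (attempt < MAX_ATTEMPTS) {
                        attempt++;
                        try {
                            verify(unit); // stand-in for a per-segment check such as getStreamLength
                            break;
                        } catch (Exception e) {
                            if (attempt >= MAX_ATTEMPTS) {
                                exceptions.add(e); // retries exhausted; surface the failure to the caller
                                break;
                            }
                            try {
                                Thread.sleep(100L * attempt); // simple stand-in for SingleSegmentUploader.waitForRetry
                            } catch (InterruptedException ie) {
                                Thread.currentThread().interrupt();
                                exceptions.add(ie);
                                return;
                            }
                        }
                    }
                }
            });
        }

        exec.shutdown();
        exec.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);

        if (!exceptions.isEmpty()) {
            // mirror the aggregate failure the uploader raises when any segment check fails
            throw new RuntimeException("At least one verification failed", exceptions.get(0));
        }
    }

    // Placeholder check that fails intermittently, just to exercise the retry path.
    private static void verify(int unit) throws Exception {
        if (Math.random() < 0.2) {
            throw new Exception("transient failure verifying unit " + unit);
        }
    }
}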