dataflow.examples.TransferFiles.java Source code

Java tutorial

Introduction

Here is the source code for dataflow.examples.TransferFiles.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package dataflow.examples;

import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPReply;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.AvroCoder;
import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
import com.google.cloud.dataflow.sdk.io.AvroIO;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.util.GcsUtil;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.PCollection;

/**
 * Copy files from an FTP site to GCS.
 */
public class TransferFiles {
    private static final Logger logger = LoggerFactory.getLogger(TransferFiles.class.getName());

    /**
     * Options for the example.
     */
    public static interface Options extends PipelineOptions {
        @Description("Path of the ftp files to match. e.g. ftp://com.ftp/dir/files*")
        String getInput();

        void setInput(String value);

        @Description("GCS path to write the files to.")
        String getOutput();

        void setOutput(String value);
    }

    @DefaultCoder(AvroCoder.class)
    public static class FilePair {
        public FilePair() {
            server = "";
            ftpPath = "";
            gcsPath = "";
        }

        public String server;
        public String ftpPath;
        public String gcsPath;
    }

    public static class FTPToGCS extends DoFn<FilePair, FilePair> {
        @Override
        public void processElement(DoFn<FilePair, FilePair>.ProcessContext c) throws Exception {
            FilePair pair = c.element();
            FTPClient ftp = new FTPClient();
            ftp.connect(pair.server);

            // After connection attempt, you should check the reply code to verify
            // success.
            int reply = ftp.getReplyCode();

            if (!FTPReply.isPositiveCompletion(reply)) {
                ftp.disconnect();
                logger.error("FTP server refused connection.");
                throw new RuntimeException("FTP server refused connection.");
            }

            if (!ftp.login("anonymous", "someemail@gmail.com")) {
                throw new RuntimeException("Failed to login. Reply: " + ftp.getReplyString());
            }

            boolean success = ftp.setFileType(FTP.BINARY_FILE_TYPE);
            if (!success) {
                logger.error("Failed to set mode to binary.");
                throw new RuntimeException("Failed to set mode to binary.");
            }

            ftp.enterLocalPassiveMode();

            GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(c.getPipelineOptions());
            GcsPath gcsPath = GcsPath.fromUri(pair.gcsPath);
            String gcsType = "application/octet-stream";
            WritableByteChannel gcsChannel = gcsUtil.create(gcsPath, gcsType);

            OutputStream gcsStream = Channels.newOutputStream(gcsChannel);
            logger.info("Starting download of: {}", pair.ftpPath);
            boolean result = ftp.retrieveFile(pair.ftpPath, gcsStream);
            if (result) {
                logger.info("Successfully completed download of: {}", pair.ftpPath);
            } else {
                logger.info("Failed to download of: {}. Reply was {}", pair.ftpPath, ftp.getReplyString());
            }

            gcsStream.close();
            ftp.logout();
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException {
        Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
        List<FilePair> filePairs = new ArrayList<FilePair>();

        URI ftpInput = new URI(options.getInput());

        FTPClient ftp = new FTPClient();
        ftp.connect(ftpInput.getHost());

        // After connection attempt, you should check the reply code to verify
        // success.
        int reply = ftp.getReplyCode();

        if (!FTPReply.isPositiveCompletion(reply)) {
            ftp.disconnect();
            logger.error("FTP server refused connection.");
            throw new RuntimeException("FTP server refused connection.");
        }

        ftp.login("anonymous", "someemail@gmail.com");

        String ftpPath = ftpInput.getPath();
        FTPFile[] files = ftp.listFiles(ftpPath);

        URI gcsUri = null;
        if (options.getOutput().endsWith("/")) {
            gcsUri = new URI(options.getOutput());
        } else {
            gcsUri = new URI(options.getOutput() + "/");
        }

        for (FTPFile f : files) {
            logger.info("File: " + f.getName());
            FilePair p = new FilePair();
            p.server = ftpInput.getHost();
            p.ftpPath = f.getName();

            // URI ftpURI = new URI("ftp", p.server, f.getName(), "");
            p.gcsPath = gcsUri.resolve(FilenameUtils.getName(f.getName())).toString();

            filePairs.add(p);
        }

        ftp.logout();

        Pipeline p = Pipeline.create(options);
        PCollection<FilePair> inputs = p.apply(Create.of(filePairs));
        inputs.apply(ParDo.of(new FTPToGCS()).named("CopyToGCS"))
                .apply(AvroIO.Write.withSchema(FilePair.class).to(options.getOutput()));
        p.run();
    }
}