co.cask.hydrator.plugin.batch.action.FileAction.java Source code

Java tutorial

Introduction

Here is the source code for co.cask.hydrator.plugin.batch.action.FileAction.java

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin.batch.action;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchActionContext;
import co.cask.cdap.etl.api.batch.PostAction;
import co.cask.hydrator.common.batch.JobUtils;
import co.cask.hydrator.common.batch.action.ConditionConfig;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.annotation.Nullable;

/**
 * Apply action (Delete, Move or Archive) on file(s) at the end of pipeline.
 * The user must specify a path of file(s) on which action required, an action and a target folder if action is
 * either ARCHIVE or MOVE.
 * The user can specify the pattern to apply on file name to filter the files.
 */
@Plugin(type = PostAction.PLUGIN_TYPE)
@Name("FileAction")
@Description("Apply action (Delete, Move or Archive) on file(s) after a pipeline run.")
public class FileAction extends PostAction {
    private static final Logger LOG = LoggerFactory.getLogger(FileAction.class);
    private final Config config;
    private Pattern regex;

    /**
     * Config for the file action plugin.
     */
    public static class Config extends ConditionConfig {
        @Description("Path to file(s) on which action required. "
                + "If a directory is specified, terminate the path name with a \'/\'.")
        private String path;

        @Description("Action to be taken on the file(s). " + "Possible actions are - "
                + "1. None - no action required." + "2. Delete - delete from the HDFS."
                + "3. Archive - archive to the target location." + "4. Moved - move to the target location.")
        private String action;

        @Nullable
        @Description("Target folder path if user select action as either ARCHIVE or MOVE. "
                + "Target folder must be an existing directory.")
        private String targetFolder;

        @Nullable
        @Description("Pattern to select specific file(s)." + "Example - "
                + "1. Use '^' to select file with name start with 'catalog', like '^catalog'."
                + "2. Use '$' to select file with name end with 'catalog.xml', like 'catalog.xml$'."
                + "3. Use '*' to select file with name contains 'catalogBook', like 'catalogBook*'.")
        private final String pattern;

        @VisibleForTesting
        Config(String path, @Nullable String targetFolder, String action, String pattern) {
            super();
            this.path = path;
            this.targetFolder = targetFolder;
            this.action = action;
            this.pattern = pattern;
        }

        @VisibleForTesting
        String getPath() {
            return path;
        }

        @VisibleForTesting
        String getTargetFolder() {
            return targetFolder;
        }

        @VisibleForTesting
        String getAction() {
            return action;
        }

        public void validate() {
            super.validate();
            Preconditions.checkArgument(!Strings.isNullOrEmpty(path), "Path cannot be empty.");
            boolean targetFolderEmpty = (action.equalsIgnoreCase("ARCHIVE") || action.equalsIgnoreCase("MOVE"))
                    && Strings.isNullOrEmpty(targetFolder);
            Preconditions.checkArgument(!targetFolderEmpty, "Target folder cannot be Empty for Action = 'action'.");
        }
    }

    public FileAction(Config config) {
        this.config = config;
    }

    // some config fields are not actually nullable even though they are annotated as such
    // the annotation is only used to tell CDAP that the field is optional, but there is always a default value for it.
    @SuppressWarnings("ConstantConditions")
    @Override
    public void run(BatchActionContext context) throws Exception {
        if (!config.shouldRun(context)) {
            return;
        }
        config.substituteMacros(context);

        Job job = JobUtils.createInstance();
        Configuration conf = job.getConfiguration();
        FileSystem fileSystem = FileSystem.get(conf);
        Path[] paths;
        Path sourcePath = new Path(config.path);
        if (fileSystem.isDirectory(sourcePath)) {
            FileStatus[] status = fileSystem.listStatus(sourcePath);
            paths = FileUtil.stat2Paths(status);
        } else {
            paths = new Path[] { sourcePath };
        }

        //get regex pattern for file name filtering.
        boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern);
        if (patternSpecified) {
            regex = Pattern.compile(config.pattern);
        }

        switch (config.action.toLowerCase()) {
        case "delete":
            for (Path path : paths) {
                if (!patternSpecified || isFileNameMatch(path.getName())) {
                    fileSystem.delete(path, true);
                }
            }
            break;
        case "move":
            for (Path path : paths) {
                if (!patternSpecified || isFileNameMatch(path.getName())) {
                    Path targetFileMovePath = new Path(config.targetFolder, path.getName());
                    fileSystem.rename(path, targetFileMovePath);
                }
            }
            break;
        case "archive":
            for (Path path : paths) {
                if (!patternSpecified || isFileNameMatch(path.getName())) {
                    try (FSDataOutputStream archivedStream = fileSystem
                            .create(new Path(config.targetFolder, path.getName() + ".zip"));
                            ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream);
                            FSDataInputStream fdDataInputStream = fileSystem.open(path)) {
                        zipArchivedStream.putNextEntry(new ZipEntry(path.getName()));
                        int length;
                        byte[] buffer = new byte[1024];
                        while ((length = fdDataInputStream.read(buffer)) > 0) {
                            zipArchivedStream.write(buffer, 0, length);
                        }
                        zipArchivedStream.closeEntry();
                    }
                    fileSystem.delete(path, true);
                }
            }
            break;
        default:
            LOG.warn("No action required on the file.");
            break;
        }
    }

    private boolean isFileNameMatch(String fileName) {
        Matcher matcher = regex.matcher(fileName);
        return matcher.find();
    }

    @Override
    public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
        config.validate();
    }

}