com.thinkbiganalytics.nifi.v2.hdfs.DistCopyHDFS.java Source code

Java tutorial

Introduction

Here is the source code for com.thinkbiganalytics.nifi.v2.hdfs.DistCopyHDFS.java

Source

package com.thinkbiganalytics.nifi.v2.hdfs;

/*-
 * #%L
 * thinkbig-nifi-hadoop-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import javax.annotation.Nonnull;

/**
 * Processor for performing a distributed copy between two HDFS clusters.
 *
 * <p>For each incoming flow file the processor evaluates the {@link #SOURCE}, {@link #DESTINATION} and
 * {@link #FILES} properties, builds the list of source paths, runs a {@link DistCp} job and routes the flow file
 * to {@link #REL_SUCCESS} when the job reports success, or to {@link #REL_FAILURE} otherwise.</p>
 *
 * @see <a href="http://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html">http://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html</a>
 */
@CapabilityDescription("Copies files from one HDFS location into another using DistCp MR job")
@EventDriven
@Tags({ "hadoop", "HDFS", "filesystem", "thinkbig", "copy", "distributed copy", "distcp" })
public class DistCopyHDFS extends AbstractHadoopProcessor {

    /**
     * Relationship for failure
     */
    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("At least one of the provided files not found").build();

    /**
     * Relationship for success
     */
    public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("Files copied to new location").build();

    /**
     * Property to define the source path, which is used as a base for all {@link #FILES}
     */
    public static final PropertyDescriptor SOURCE = new PropertyDescriptor.Builder().name("source.path")
            .description(
                    "Absolute source path, if provided along with 'files' parameter will be treated as absolute "
                            + "path for relative paths provided in that parameter")
            .required(false).addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    /**
     * Property that defines the destination path as an absolute path
     */
    public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder().name("destination.path")
            .description("Absolute destination path").required(true)
            .addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    /**
     * Property which is a JSON encoded list of files of the form:
     * [{ "name" : "example" }]
     */
    public static final PropertyDescriptor FILES = new PropertyDescriptor.Builder().name("files")
            // The example must itself be valid JSON: no trailing comma after the last member.
            .description("JSON-encoded list of files, given like: " + "[{\n" + "   \"name\": \"example\"\n"
                    + " }\n" + "]")
            .required(false).addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    /**
     * Output paths to other NiFi processors
     */
    private static final Set<Relationship> RELATIONSHIPS = ImmutableSet.of(REL_FAILURE, REL_SUCCESS);

    /**
     * Shared JSON parser used to decode the {@link #FILES} property; created once instead of once per trigger.
     */
    private static final Gson GSON = new Gson();

    /**
     * Returns the property descriptors of the parent class followed by {@link #DESTINATION}, {@link #SOURCE}
     * and {@link #FILES}.
     *
     * @return an immutable list of the supported property descriptors
     */
    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return ImmutableList.<PropertyDescriptor>builder().addAll(super.getSupportedPropertyDescriptors())
                .add(DESTINATION).add(SOURCE).add(FILES).build();
    }

    /**
     * get the relationships required for the Processor
     *
     * @return a set of relationships
     */
    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIPS;
    }

    /**
     * onTrigger is called when the flow file proceeds through the processor
     *
     * <p>The flow file is routed to {@link #REL_FAILURE} when the file system cannot be obtained, when neither
     * {@link #SOURCE} nor {@link #FILES} is set, when the destination is empty, when the files list is not valid,
     * when the DistCp job throws or reports that it did not succeed, or when the thread is interrupted.
     * Otherwise it is routed to {@link #REL_SUCCESS}.</p>
     *
     * @param context passed in by the framework and provides access to the data configured in the processor
     * @param session passed in by the framework and provides access to the flow file
     * @throws ProcessException if any framework actions fail
     */
    @Override
    public void onTrigger(@Nonnull final ProcessContext context, @Nonnull final ProcessSession session)
            throws ProcessException {
        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        // The file system handle itself is not used below; a null value only signals that HDFS is unavailable.
        final FileSystem fs = getFileSystem(context);
        if (fs == null) {
            getLog().error("Couldn't initialize HDFS");
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        final String filesJSON = context.getProperty(FILES).evaluateAttributeExpressions(flowFile).getValue();
        final String source = context.getProperty(SOURCE).evaluateAttributeExpressions(flowFile).getValue();
        final String destination = context.getProperty(DESTINATION).evaluateAttributeExpressions(flowFile).getValue();

        if (isNullOrEmpty(filesJSON) && isNullOrEmpty(source)) {
            getLog().error(String.format("At least one of attributes: %s or %s needs to be set",
                    SOURCE.getName(), FILES.getName()));
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        // The expression may evaluate to nothing even though the property is required.
        if (isNullOrEmpty(destination)) {
            getLog().error(String.format("Attribute %s evaluated to an empty value", DESTINATION.getName()));
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        try {
            final List<Path> pathsList = buildSourcePaths(filesJSON, source);
            final DistCp distCp = getDistCp(pathsList, new Path(destination));
            final Job job = distCp.execute();
            // waitForCompletion reports whether the job succeeded; an unsuccessful job must not reach success.
            if (!job.waitForCompletion(false)) {
                getLog().error("DistCp job " + job.getJobID() + " did not complete successfully");
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
        } catch (JsonSyntaxException e) {
            getLog().error("Files list attribute does not contain a proper JSON array", e);
            session.transfer(flowFile, REL_FAILURE);
            return;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the framework can observe the interruption.
            Thread.currentThread().interrupt();
            getLog().error("Interrupted while waiting for the DistCp job to complete", e);
            session.transfer(flowFile, REL_FAILURE);
            return;
        } catch (Exception e) {
            // Pass the throwable along so the stack trace is not lost.
            getLog().error("Exception during processor execution: " + e.getMessage(), e);
            session.transfer(flowFile, REL_FAILURE);
            return;
        }
        session.transfer(flowFile, REL_SUCCESS);
    }

    /**
     * No additional Hadoop configuration is applied by this processor.
     *
     * @param context the process context
     * @param config  the Hadoop configuration, left untouched
     */
    @Override
    protected void modifyConfig(ProcessContext context, Configuration config) {
        // intentionally empty
    }

    /**
     * method to construct a new DistCp object to perform the distcp
     *
     * @param pathsList   A list of paths to be recursively copied from one cluster to another
     * @param destination The root location on the target cluster
     * @return a DistCp object
     * @throws Exception if the construction of the {@link DistCp} object fails for any reason
     */
    protected DistCp getDistCp(List<Path> pathsList, Path destination) throws Exception {
        final Configuration conf = getConfiguration();
        final DistCpOptions opts = new DistCpOptions(pathsList, destination);
        return new DistCp(conf, opts);
    }

    /**
     * Builds the list of source paths for the copy.
     *
     * <p>When {@code filesJSON} is empty, the single path {@code source} is returned. Otherwise every entry of the
     * decoded list becomes one path, resolved against {@code source} when a source is given and used as-is
     * when it is not.</p>
     *
     * @param filesJSON the evaluated {@link #FILES} value, may be {@code null} or empty
     * @param source    the evaluated {@link #SOURCE} value, may be {@code null} or empty only if
     *                  {@code filesJSON} is not
     * @return the source paths, possibly empty when the JSON decodes to an empty list or to {@code null}
     * @throws JsonSyntaxException      if {@code filesJSON} is not a JSON array of objects
     * @throws IllegalArgumentException if an entry of the list is {@code null} or has no name
     */
    private static List<Path> buildSourcePaths(final String filesJSON, final String source) {
        final List<Path> paths = new ArrayList<>();
        if (isNullOrEmpty(filesJSON)) {
            paths.add(new Path(source));
            return paths;
        }

        final File[] files = GSON.fromJson(filesJSON, File[].class);
        if (files == null) {
            // The JSON literal null decodes to a null array; treat it like an empty list.
            return paths;
        }

        final boolean hasSource = !isNullOrEmpty(source);
        for (final File file : files) {
            if (file == null || isNullOrEmpty(file.getName())) {
                throw new IllegalArgumentException(
                        "Every entry of the files list must be a JSON object with a non-empty 'name'");
            }
            paths.add(hasSource ? new Path(source, file.getName()) : new Path(file.getName()));
        }
        return paths;
    }

    /**
     * Tells whether a string is {@code null} or has zero length.
     *
     * @param value the string to check
     * @return {@code true} if {@code value} is {@code null} or empty
     */
    private static boolean isNullOrEmpty(final String value) {
        return value == null || value.isEmpty();
    }

    /**
     * One entry of the {@link #FILES} JSON list.
     *
     * <p>Declared static so that it can be instantiated by the JSON parser without an enclosing
     * {@link DistCopyHDFS} instance.</p>
     */
    static class File {

        private String name;

        /**
         * No-argument constructor for the JSON parser.
         */
        private File() {
        }

        /**
         * Creates an entry with the given name.
         *
         * @param name the file name
         */
        public File(String name) {
            this.name = name;
        }

        /**
         * @return the file name, or {@code null} if none was set
         */
        public String getName() {
            return name;
        }

        /**
         * @param name the file name
         */
        public void setName(String name) {
            this.name = name;
        }
    }

}