com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksums.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksums.java, a NiFi processor that computes HDFS checksums for a list of files and, optionally, routes the flow file to failure when a provided checksum does not match the computed one.
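
At its core the processor relies on Hadoop's FileSystem.getFileChecksum, which returns a FileChecksum whose raw bytes the processor Base64-encodes for comparison. Here is a minimal standalone sketch of that call; the path and the default Configuration are illustrative assumptions, not part of the original source.

import java.util.Base64;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ChecksumSketch {

    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml/hdfs-site.xml from the classpath
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical file path; getFileChecksum returns null when the
        // underlying file system does not provide checksums
        FileChecksum checksum = fs.getFileChecksum(new Path("/example/landing/example"));
        if (checksum != null) {
            System.out.println(checksum.getAlgorithmName() + ": "
                    + Base64.getEncoder().encodeToString(checksum.getBytes()));
        }
    }
}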

Source

package com.thinkbiganalytics.nifi.v2.hdfs;

/*-
 * #%L
 * thinkbig-nifi-hadoop-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Base64;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import javax.annotation.Nonnull;

@CapabilityDescription("Computes HDFS checksums for a list of files")
@EventDriven
@Tags({ "hadoop", "HDFS", "filesystem", "thinkbig", "checksum", "hash", "md5" })
public class ComputeHDFSChecksums extends AbstractHadoopProcessor {

    /**
     * Relationship for failure
     */
    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("At least one of the provided checksums doesn't match the computed one").build();

    /**
     * Relationship for success
     */
    public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("Flow files go to the success relationship").build();

    /**
     * the absolute base directory for the files given by {@link #FILES}
     */
    public static final PropertyDescriptor DIRECTORY = new PropertyDescriptor.Builder().name("absolute.path")
            .description(
                    "The absolute path to the HDFS directory containing the files to check. If not provided, "
                            + "file names will be treated as absolute paths")
            .required(false).addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    /**
     * directs the processor to fail if any of the files given have a provided checksum not matching the one computed by this processor
     */
    public static final PropertyDescriptor FAIL_IF_INCORRECT_CHECKSUM = new PropertyDescriptor.Builder()
            .name("failIfWrongChecksum")
            .description("Decides whether flow should be failed if provided checksum doesn't match computed one")
            .required(true).defaultValue("True").addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .allowableValues(Sets.newHashSet("True", "False")).build();

    /**
     * A JSON-encoded list of files and checksums. File names are either relative to {@link #DIRECTORY} or absolute paths.
     */
    public static final PropertyDescriptor FILES = new PropertyDescriptor.Builder().name("files")
            .description("JSON-encoded list of files with their checksums, given like: " + "[{\n"
                    + "   \"name\": \"example\",\n" + "   \"size\": 123456,\n" + "   \"checksum\": {\n"
                    + "      \"length\": 28,\n"
                    + "      \"value\": \"AAAAAAAAAAAAAAAAcLyPS3KoaSFGi/joRB3OUQAAAAA=\",\n"
                    + "      \"algorithm\": \"MD5-of-0MD5-of-0CRC32\"\n" + "   }\n" + "}]")
            .required(true).addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    /**
     * Output paths to other NiFi processors
     */
    private static final Set<Relationship> RELATIONSHIPS = ImmutableSet.of(REL_FAILURE, REL_SUCCESS);

    /**
     * Gets the list of properties supported by this processor
     *
     * @return the list of properties supported by this processor
     */
    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return ImmutableList.<PropertyDescriptor>builder().addAll(super.getSupportedPropertyDescriptors())
                .add(DIRECTORY).add(FAIL_IF_INCORRECT_CHECKSUM).add(FILES).build();
    }

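    /**
     * No changes to the Hadoop configuration are required by this processor.
     */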
    @Override
    protected void modifyConfig(ProcessContext context, Configuration config) {
    }

    /**
     * Gets the relationships for this processor
     *
     * @return the set of relationships
     */
    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIPS;
    }

    @Override
    public void onTrigger(@Nonnull final ProcessContext context, @Nonnull final ProcessSession session)
            throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }
        final FileSystem fs = getFileSystem(context);
        if (fs == null) {
            getLog().error("Couldn't initialize HDFS");
            session.transfer(flowFile, REL_FAILURE);
            return;
        }
        String filesJSON = context.getProperty(FILES).evaluateAttributeExpressions(flowFile).getValue();
        String absolutePath = context.getProperty(DIRECTORY).evaluateAttributeExpressions(flowFile).getValue();
        Boolean failIfWrongChecksum = context.getProperty(FAIL_IF_INCORRECT_CHECKSUM)
                .evaluateAttributeExpressions(flowFile).asBoolean();
        Gson jsonParser = new Gson();
        File[] filesList;
        try {
            filesList = jsonParser.fromJson(filesJSON, File[].class);
            if (filesList == null) {
                filesList = new File[0];
            }

            for (File f : filesList) {
                String name = f.getName();
                Path filePath;
                if (absolutePath == null || absolutePath.isEmpty()) {
                    filePath = new Path(name);
                } else {
                    filePath = new Path(absolutePath, name);
                }
                FileChecksum computedChecksum = fs.getFileChecksum(filePath);
                if (computedChecksum == null) {
                    // getFileChecksum returns null when the underlying file system
                    // does not provide checksums for the given path
                    getLog().error("No checksum is available for file: " + filePath.toString());
                    session.transfer(flowFile, REL_FAILURE);
                    return;
                }
                String b64Checksum = Base64.getEncoder().encodeToString(computedChecksum.getBytes());
                f.setComputedChecksum(
                        new Checksum(b64Checksum.length(), b64Checksum, computedChecksum.getAlgorithmName()));
                if (failIfWrongChecksum && !Objects.equals(b64Checksum, f.getChecksum().getValue())) {
                    getLog().error("Checksums don't match! File: " + filePath.toString() + " checksum provided: "
                            + f.getChecksum().getValue() + " checksum computed: " + b64Checksum);
                    session.transfer(flowFile, REL_FAILURE);
                    return;
                }
            }
        } catch (JsonSyntaxException e) {
            getLog().error("Files list attribute does not contain a proper JSON array");
            session.transfer(flowFile, REL_FAILURE);
            return;
        } catch (FileNotFoundException e) {
            getLog().error("One of the provided files not found.\n" + e.getMessage());
            session.transfer(flowFile, REL_FAILURE);
            return;
        } catch (IOException e) {
            throw new ProcessException(e);
        }
        flowFile = session.putAttribute(flowFile, FILES.getName(), jsonParser.toJson(filesList));
        session.transfer(flowFile, REL_SUCCESS);
    }

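    /**
     * Gson mapping for a single file entry in the JSON files list.
     */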
    class File {

        private String name;
        private Integer size;
        private Checksum checksum;
        private Checksum computedChecksum;

        public Checksum getComputedChecksum() {
            return computedChecksum;
        }

        public void setComputedChecksum(Checksum computedChecksum) {
            this.computedChecksum = computedChecksum;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public Integer getSize() {
            return size;
        }

        public void setSize(Integer size) {
            this.size = size;
        }

        public Checksum getChecksum() {
            return checksum;
        }

        public void setChecksum(Checksum checksum) {
            this.checksum = checksum;
        }
    }

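    /**
     * Gson mapping for a checksum entry; also used to report the checksum computed by this processor.
     */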
    class Checksum {

        private Integer length;
        private String value;
        private String algorithm;

        public Checksum(Integer length, String value, String algorithm) {
            this.length = length;
            this.value = value;
            this.algorithm = algorithm;
        }

        public Integer getLength() {
            return length;
        }

        public void setLength(Integer length) {
            this.length = length;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }

        public String getAlgorithm() {
            return algorithm;
        }

        public void setAlgorithm(String algorithm) {
            this.algorithm = algorithm;
        }
    }
}
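
Example usage

Below is a minimal, hypothetical sketch of how the processor could be exercised with NiFi's nifi-mock TestRunner. The directory, file name, and checksum values are illustrative, and a reachable HDFS (configured through the properties inherited from AbstractHadoopProcessor) is assumed.

import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;

public class ComputeHDFSChecksumsExample {

    public static void main(String[] args) {
        TestRunner runner = TestRunners.newTestRunner(ComputeHDFSChecksums.class);

        // Hypothetical base directory on HDFS
        runner.setProperty(ComputeHDFSChecksums.DIRECTORY, "/example/landing");
        runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");

        // JSON files list in the format documented on the FILES property
        runner.setProperty(ComputeHDFSChecksums.FILES,
                "[{\"name\": \"example\", \"size\": 123456, \"checksum\": {"
                        + "\"length\": 28, "
                        + "\"value\": \"AAAAAAAAAAAAAAAAcLyPS3KoaSFGi/joRB3OUQAAAAA=\", "
                        + "\"algorithm\": \"MD5-of-0MD5-of-0CRC32\"}}]");

        runner.enqueue(new byte[0]);
        runner.run();

        // With matching checksums the flow file lands on REL_SUCCESS and the
        // "files" attribute carries the computed checksums; on a mismatch (with
        // failIfWrongChecksum set to True) it lands on REL_FAILURE instead
    }
}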