org.apache.nifi.processors.mongodb.gridfs.PutGridFS.java Source code

Introduction

Here is the source code for org.apache.nifi.processors.mongodb.gridfs.PutGridFS.java, the Apache NiFi processor that writes a FlowFile's content to a MongoDB GridFS bucket.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.nifi.processors.mongodb.gridfs;

import com.mongodb.client.gridfs.GridFSBucket;
import com.mongodb.client.gridfs.model.GridFSUploadOptions;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.Validator;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.mongodb.MongoDBClientService;
import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StringUtils;
import org.bson.Document;
import org.bson.types.ObjectId;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({ "mongo", "gridfs", "put", "file", "store" })
@CapabilityDescription("Writes a file to a GridFS bucket.")
public class PutGridFS extends AbstractGridFSProcessor {

    static final PropertyDescriptor PROPERTIES_PREFIX = new PropertyDescriptor.Builder()
            .name("putgridfs-properties-prefix").displayName("File Properties Prefix")
            .description("Attributes that have this prefix will be added to the file stored in GridFS as metadata.")
            .required(false).expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
            .addValidator(Validator.VALID).build();

    static final AllowableValue NO_UNIQUE = new AllowableValue("none", "None", "No uniqueness will be enforced.");
    static final AllowableValue UNIQUE_NAME = new AllowableValue("name", "Name",
            "Only the filename must be unique.");
    static final AllowableValue UNIQUE_HASH = new AllowableValue("hash", "Hash",
            "Only the file hash must be unique.");
    static final AllowableValue UNIQUE_BOTH = new AllowableValue("both", "Both",
            "Both the filename and hash must be unique.");

    static final PropertyDescriptor ENFORCE_UNIQUENESS = new PropertyDescriptor.Builder()
            .name("putgridfs-enforce-uniqueness").displayName("Enforce Uniqueness")
            .description(
                    "When enabled, this option will ensure that uniqueness is enforced on the bucket. It will do so by creating a MongoDB index "
                            + "that matches your selection. It should ideally be configured once when the bucket is created for the first time because "
                            + "it could take a long time to build on an existing bucket wit a lot of data.")
            .allowableValues(NO_UNIQUE, UNIQUE_BOTH, UNIQUE_NAME, UNIQUE_HASH).defaultValue(NO_UNIQUE.getValue())
            .required(true).build();
    static final PropertyDescriptor HASH_ATTRIBUTE = new PropertyDescriptor.Builder()
            .name("putgridfs-hash-attribute").displayName("Hash Attribute")
            .description(
                    "If uniquness enforcement is enabled and the file hash is part of the constraint, this must be set to an attribute that "
                            + "exists on all incoming flowfiles.")
            .defaultValue("hash.value").expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();
    static final PropertyDescriptor CHUNK_SIZE = new PropertyDescriptor.Builder().name("putgridfs-chunk-size")
            .displayName("Chunk Size")
            .description("Controls the maximum size of each chunk of a file uploaded into GridFS.")
            .defaultValue("256 KB").required(true)
            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
            .addValidator(StandardValidators.DATA_SIZE_VALIDATOR).build();

    static final PropertyDescriptor FILE_NAME = new PropertyDescriptor.Builder().name("gridfs-file-name")
            .displayName("File Name")
            .description(
                    "The name of the file in the bucket that is the target of this processor. GridFS file names do not "
                            + "include path information because GridFS does not sort files into folders within a bucket.")
            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES).required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    static final Relationship REL_DUPLICATE = new Relationship.Builder().name("duplicate")
            .description("Flowfiles that fail the duplicate check are sent to this relationship.").build();

    static final String ID_ATTRIBUTE = "gridfs.id";

    static final List<PropertyDescriptor> DESCRIPTORS;
    static final Set<Relationship> RELATIONSHIP_SET;

    static {
        // Combine the parent processor's properties and relationships with the GridFS-specific ones defined above.
        List<PropertyDescriptor> _temp = new ArrayList<>();
        _temp.addAll(PARENT_PROPERTIES);
        _temp.add(FILE_NAME);
        _temp.add(PROPERTIES_PREFIX);
        _temp.add(ENFORCE_UNIQUENESS);
        _temp.add(HASH_ATTRIBUTE);
        _temp.add(CHUNK_SIZE);
        DESCRIPTORS = Collections.unmodifiableList(_temp);

        Set<Relationship> _rels = new HashSet<>();
        _rels.addAll(PARENT_RELATIONSHIPS);
        _rels.add(REL_DUPLICATE);
        RELATIONSHIP_SET = Collections.unmodifiableSet(_rels);
    }

    private String uniqueness;
    private String hashAttribute;

    @OnScheduled
    public void onScheduled(ProcessContext context) {
        this.uniqueness = context.getProperty(ENFORCE_UNIQUENESS).getValue();
        this.hashAttribute = context.getProperty(HASH_ATTRIBUTE).evaluateAttributeExpressions().getValue();
        this.clientService = context.getProperty(CLIENT_SERVICE).asControllerService(MongoDBClientService.class);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIP_SET;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return DESCRIPTORS;
    }

    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        FlowFile input = session.get();
        if (input == null) {
            return;
        }

        GridFSBucket bucket = getBucket(input, context);

        if (!canUploadFile(context, input, bucket.getBucketName())) {
            getLogger().error("Cannot upload the file because of the uniqueness policy configured.");
            session.transfer(input, REL_DUPLICATE);
            return;
        }

        final int chunkSize = context.getProperty(CHUNK_SIZE).evaluateAttributeExpressions(input)
                .asDataSize(DataUnit.B).intValue();

        try (InputStream fileInput = session.read(input)) {
            String fileName = context.getProperty(FILE_NAME).evaluateAttributeExpressions(input).getValue();
            GridFSUploadOptions options = new GridFSUploadOptions().chunkSizeBytes(chunkSize)
                    .metadata(getMetadata(input, context));
            ObjectId id = bucket.uploadFromStream(fileName, fileInput, options);
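            // Close the read stream explicitly before touching the FlowFile again: NiFi does not
            // allow attribute changes or transfers while a session.read() stream is still open.
            // The outer try-with-resources close then becomes a no-op.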
            fileInput.close();

            if (id != null) {
                input = session.putAttribute(input, ID_ATTRIBUTE, id.toString());
                session.transfer(input, REL_SUCCESS);
                session.getProvenanceReporter().send(input, getTransitUri(id, input, context));
            } else {
                getLogger().error("ID was null, assuming failure.");
                session.transfer(input, REL_FAILURE);
            }
        } catch (Exception ex) {
            getLogger().error("Failed to upload file", ex);
            session.transfer(input, REL_FAILURE);
        }
    }

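    // Checks the <bucket>.files collection for an existing document that matches the configured
    // uniqueness mode. Note that the hash comparison relies on the "md5" field of the files
    // documents, so the incoming hash attribute is expected to hold an MD5 value.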
    private boolean canUploadFile(ProcessContext context, FlowFile input, String bucketName) {
        boolean retVal;

        if (uniqueness.equals(NO_UNIQUE.getValue())) {
            retVal = true;
        } else {
            final String fileName = input.getAttribute(CoreAttributes.FILENAME.key());
            final String fileColl = String.format("%s.files", bucketName);
            final String hash = input.getAttribute(hashAttribute);

            if ((uniqueness.equals(UNIQUE_BOTH.getValue()) || uniqueness.equals(UNIQUE_HASH.getValue()))
                    && StringUtils.isEmpty(hash)) {
                throw new RuntimeException(
                        String.format("Uniqueness mode %s was set and the hash attribute %s was not found.",
                                uniqueness, hashAttribute));
            }

            Document query;
            if (uniqueness.equals(UNIQUE_BOTH.getValue())) {
                query = new Document().append("filename", fileName).append("md5", hash);
            } else if (uniqueness.equals(UNIQUE_HASH.getValue())) {
                query = new Document().append("md5", hash);
            } else {
                query = new Document().append("filename", fileName);
            }

            retVal = getDatabase(input, context).getCollection(fileColl).count(query) == 0;
        }

        return retVal;
    }

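    // Builds the GridFS metadata document: every FlowFile attribute whose key starts with the
    // configured prefix is copied in, with the prefix (normalized to end in a '.') stripped
    // from the key.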
    private Document getMetadata(FlowFile input, ProcessContext context) {
        final String prefix = context.getProperty(PROPERTIES_PREFIX).evaluateAttributeExpressions(input).getValue();
        Document doc;

        if (StringUtils.isEmpty(prefix)) {
            doc = Document.parse("{}");
        } else {
            doc = new Document();
            Map<String, String> attributes = input.getAttributes();
            for (Map.Entry<String, String> entry : attributes.entrySet()) {
                if (entry.getKey().startsWith(prefix)) {
                    String cleanPrefix = prefix.endsWith(".") ? prefix : String.format("%s.", prefix);
                    String cleanKey = entry.getKey().replace(cleanPrefix, "");
                    doc.append(cleanKey, entry.getValue());
                }
            }
        }

        return doc;
    }
}
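
PutGridFS is essentially a thin wrapper around the MongoDB Java driver's GridFS API. The following is a minimal standalone sketch of the same upload path outside NiFi, assuming the modern synchronous driver (it uses countDocuments rather than the deprecated count call seen in canUploadFile above). The connection string, database, bucket, class, and file names are placeholders, not values taken from the processor's configuration.

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.gridfs.GridFSBucket;
import com.mongodb.client.gridfs.GridFSBuckets;
import com.mongodb.client.gridfs.model.GridFSUploadOptions;
import org.bson.Document;
import org.bson.types.ObjectId;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

public class GridFSUploadSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder connection details; adjust for your environment.
        try (MongoClient client = MongoClients.create("mongodb://localhost:27017")) {
            MongoDatabase db = client.getDatabase("demo");
            GridFSBucket bucket = GridFSBuckets.create(db, "files");

            String fileName = "example.txt";

            // Equivalent of the "name" uniqueness mode in canUploadFile():
            // look for an existing document with the same filename in <bucket>.files.
            long existing = db.getCollection("files.files")
                    .countDocuments(new Document("filename", fileName));
            if (existing > 0) {
                System.out.println("Duplicate found; PutGridFS would route to 'duplicate'.");
                return;
            }

            // Upload with an explicit chunk size and a metadata document, mirroring
            // the CHUNK_SIZE property and the prefixed-attribute metadata in onTrigger().
            GridFSUploadOptions options = new GridFSUploadOptions()
                    .chunkSizeBytes(256 * 1024)
                    .metadata(new Document("source", "sketch"));

            try (InputStream in = new ByteArrayInputStream(
                    "hello gridfs".getBytes(StandardCharsets.UTF_8))) {
                ObjectId id = bucket.uploadFromStream(fileName, in, options);
                System.out.println("Stored file with id " + id.toHexString());
            }
        }
    }
}

On success the processor stores the returned ObjectId in the gridfs.id attribute and reports a provenance SEND event, which the sketch approximates by printing the hex id.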