org.apache.nifi.processors.hadoop.GetHDFSSequenceFile.java Source code


Introduction

Here is the source code for org.apache.nifi.processors.hadoop.GetHDFSSequenceFile.java, an Apache NiFi processor that fetches SequenceFile-formatted files from HDFS into FlowFiles. A few illustrative sketches of the content layout and the underlying Hadoop APIs follow the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.annotation.behavior.TriggerWhenEmpty;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processors.hadoop.util.SequenceFileReader;
import org.apache.nifi.util.StopWatch;

import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * This processor pulls files from HDFS. The files being pulled MUST be SequenceFile formatted. The processor creates a flow file for each key/value entry in the ingested
 * SequenceFile. The created flow file's content depends on the value of the optional configuration property FlowFile Content. Currently there are two choices: VALUE ONLY and KEY VALUE PAIR. With the
 * former, only the SequenceFile value element is written to the flow file contents. With the latter, the SequenceFile key and value are both written to the flow file contents as serialized objects;
 * the format is key length (int), key (String), value length (int), value (bytes). The default is VALUE ONLY.
 * <p>
 * NOTE: This processor loads an entire value entry into memory. Although a single value entry is limited to 2 GB, large values combined with many concurrent tasks can still exhaust the heap.
 */
@TriggerWhenEmpty
@Tags({ "hadoop", "HDFS", "get", "fetch", "ingest", "source", "sequence file" })
@CapabilityDescription("Fetch sequence files from Hadoop Distributed File System (HDFS) into FlowFiles")
@SeeAlso(PutHDFS.class)
public class GetHDFSSequenceFile extends GetHDFS {

    static final String VALUE_ONLY = "VALUE ONLY";

    static final PropertyDescriptor FLOWFILE_CONTENT = new PropertyDescriptor.Builder().name("FlowFile Content")
            .description(
                    "Indicates whether the content is to be both the key and the value of the SequenceFile entry, or just the value.")
            .allowableValues(VALUE_ONLY, "KEY VALUE PAIR").defaultValue(VALUE_ONLY).required(true).build();

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        List<PropertyDescriptor> someProps = new ArrayList<>(super.getSupportedPropertyDescriptors());
        someProps.add(FLOWFILE_CONTENT);
        return Collections.unmodifiableList(someProps);
    }

    @Override
    protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
            final ProcessSession session) {
        final Configuration conf = getConfiguration();
        final FileSystem hdfs = getFileSystem();
        final String flowFileContentValue = context.getProperty(FLOWFILE_CONTENT).getValue();
        final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
        final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
        if (bufferSizeProp != null) {
            // Propagate the configured Buffer Size to the Hadoop client configuration.
            final int bufferSize = bufferSizeProp.intValue();
            conf.setInt(BUFFER_SIZE_KEY, bufferSize);
        }
        ComponentLog logger = getLogger();
        final SequenceFileReader<Set<FlowFile>> reader;
        if (flowFileContentValue.equalsIgnoreCase(VALUE_ONLY)) {
            reader = new ValueReader(session);
        } else {
            reader = new KeyValueReader(session);
        }
        for (final Path file : files) {
            if (!this.isScheduled()) {
                break; // This processor should stop running immediately.
            }
            // Reset per file so the finally block never reports or transfers
            // flow files left over from a previous iteration.
            Set<FlowFile> flowFiles = Collections.emptySet();

            final StopWatch stopWatch = new StopWatch(false);
            try {
                stopWatch.start();
                if (!hdfs.exists(file)) {
                    continue; // If file is no longer here move on.
                }
                logger.debug("Reading file {}", new Object[] { file });
                flowFiles = getFlowFiles(conf, hdfs, reader, file);
                if (!keepSourceFiles && !hdfs.delete(file, false)) {
                    logger.warn("Unable to delete path {} from HDFS; it will likely be picked up again on the next listing",
                            new Object[] { file });
                }
            } catch (final Throwable t) {
                logger.error("Error retrieving file {} from HDFS", new Object[] { file }, t);
                session.rollback();
                context.yield();
                // The rollback discarded any flow files created for this file; clear
                // the set so the finally block does not try to transfer them.
                flowFiles = Collections.emptySet();
            } finally {
                stopWatch.stop();
                long totalSize = 0;
                for (FlowFile flowFile : flowFiles) {
                    totalSize += flowFile.getSize();
                    session.getProvenanceReporter().receive(flowFile, file.toString());
                }
                if (totalSize > 0) {
                    final String dataRate = stopWatch.calculateDataRate(totalSize);
                    final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                    logger.info(
                            "Created {} flowFiles from SequenceFile {}. Ingested in {} milliseconds at a rate of {}",
                            new Object[] { flowFiles.size(), file.toUri().toASCIIString(), millis, dataRate });
                    logger.info("Transferred flowFiles {}  to success", new Object[] { flowFiles });
                    session.transfer(flowFiles, REL_SUCCESS);
                }
            }
        }
    }

    /**
     * Reads the given SequenceFile, running the read as the configured Kerberos identity when one is available.
     */
    protected Set<FlowFile> getFlowFiles(final Configuration conf, final FileSystem hdfs,
            final SequenceFileReader<Set<FlowFile>> reader, final Path file) throws Exception {
        final PrivilegedExceptionAction<Set<FlowFile>> privilegedExceptionAction = new PrivilegedExceptionAction<Set<FlowFile>>() {
            @Override
            public Set<FlowFile> run() throws Exception {
                return reader.readSequenceFile(file, conf, hdfs);
            }
        };
        final UserGroupInformation userGroupInformation = getUserGroupInformation();
        if (userGroupInformation == null) {
            // No Kerberos identity configured; read as the current process user.
            return privilegedExceptionAction.run();
        } else {
            // Run the read with the configured identity's credentials.
            return userGroupInformation.doAs(privilegedExceptionAction);
        }
    }
}
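
Examples

The class javadoc above defines the KEY VALUE PAIR content layout: key length (int), key (String), value length (int), value (bytes). The following is a minimal decoder sketch for that layout, assuming the lengths are byte counts and the key is UTF-8 text; the KeyValuePairDecoder and Entry names are hypothetical and not part of NiFi.

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

public class KeyValuePairDecoder {

    /** One decoded key/value entry. */
    public static final class Entry {
        public final String key;
        public final byte[] value;

        Entry(final String key, final byte[] value) {
            this.key = key;
            this.value = value;
        }
    }

    /**
     * Decodes a single entry laid out as: key length (int), key bytes,
     * value length (int), value bytes. Length validation is omitted for brevity.
     */
    public static Entry decode(final InputStream in) throws IOException {
        final DataInputStream dis = new DataInputStream(in);
        final int keyLength = dis.readInt();            // key length (int)
        final byte[] keyBytes = new byte[keyLength];
        dis.readFully(keyBytes);                        // key (assumed UTF-8)
        final int valueLength = dis.readInt();          // value length (int)
        final byte[] valueBytes = new byte[valueLength];
        dis.readFully(valueBytes);                      // value (raw bytes)
        return new Entry(new String(keyBytes, StandardCharsets.UTF_8), valueBytes);
    }
}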
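
In processBatchOfFiles, per-entry reading is delegated to a SequenceFileReader implementation (ValueReader or KeyValueReader, selected by the FlowFile Content property). Outside of NiFi, the same one-record-at-a-time iteration can be done directly with Hadoop's SequenceFile.Reader. This is a standalone sketch assuming Writable key and value classes and a configuration that can reach the file; the SequenceFileDump name is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileDump {

    public static void main(final String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Path path = new Path(args[0]);

        // The reader resolves the key and value classes from the file header.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            final Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            final Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            // One iteration per key/value entry, mirroring the processor's
            // one-flow-file-per-entry model.
            while (reader.next(key, value)) {
                System.out.println(key + " => " + value);
            }
        }
    }
}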
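
Finally, getFlowFiles wraps the read in a PrivilegedExceptionAction so that, when getUserGroupInformation() returns a Kerberos identity, the HDFS access executes inside UserGroupInformation.doAs. The pattern looks like this in isolation; the principal, keytab, and file path below are placeholders, and a keytab-based login is assumed.

import java.security.PrivilegedExceptionAction;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;

public class DoAsRead {

    public static void main(final String[] args) throws Exception {
        final Configuration conf = new Configuration();
        UserGroupInformation.setConfiguration(conf);

        // Placeholder principal and keytab; substitute real values.
        final UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
                "nifi@EXAMPLE.COM", "/etc/security/keytabs/nifi.keytab");

        // Everything inside run() executes with the logged-in identity's credentials.
        final boolean exists = ugi.doAs(new PrivilegedExceptionAction<Boolean>() {
            @Override
            public Boolean run() throws Exception {
                final FileSystem fs = FileSystem.get(conf);
                return fs.exists(new Path("/data/example.seq"));
            }
        });
        System.out.println("exists = " + exists);
    }
}

As in the processor, running the action directly when no identity is configured keeps a single code path for both secured and unsecured clusters.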