com.datatorrent.lib.io.fs.AbstractHdfsFileOutputOperator.java Source code

Introduction

Here is the source code for com.datatorrent.lib.io.fs.AbstractHdfsFileOutputOperator.java, the base class for HDFS file output operators in the DataTorrent library. It provides the common setup, teardown, and file open/close logic; concrete subclasses decide how each incoming tuple is serialized and written.

Source

/*
 * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.lib.io.fs;

import java.io.BufferedOutputStream;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.datatorrent.api.BaseOperator;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DefaultInputPort;

/**
 * Base class for HDFS file output operators.
 * Provides base implementations of setup, teardown, and opening and closing files.
 *
 * @param <INPUT> incoming tuple type
 * @since 1.0.2
 */
public abstract class AbstractHdfsFileOutputOperator<INPUT> extends BaseOperator {
    protected transient FSDataOutputStream fsOutput;
    protected transient BufferedOutputStream bufferedOutput;
    protected transient FileSystem fs;
    protected String filePath;
    protected long totalBytesWritten = 0;
    protected boolean append = true;
    protected int bufferSize = 0;
    protected int replication = 0;
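    /**
     * The input port; each tuple received here is handed to {@link #processTuple}.
     */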
    public final transient DefaultInputPort<INPUT> input = new DefaultInputPort<INPUT>() {
        @Override
        public void process(INPUT t) {
            processTuple(t);
        }

    };

    /**
     * Function to be implemented to process each incoming tuple
     *
     * @param t incoming tuple
     */
    protected abstract void processTuple(INPUT t);

    /**
     * Opens an output stream to the given path, appending to or replacing any
     * existing file depending on the append setting.
     *
     * @param filepath path of the file to open
     * @throws IOException
     */
    protected void openFile(Path filepath) throws IOException {
        if (replication <= 0) {
            replication = fs.getDefaultReplication(filepath);
        }
        if (append && fs.exists(filepath)) {
            fsOutput = fs.append(filepath);
            logger.debug("appending to {}", filepath);
        } else {
            if (fs.exists(filepath)) {
                fs.delete(filepath, true);
            }
            fsOutput = fs.create(filepath, (short) replication);
            logger.debug("creating {} with replication {}", filepath, replication);
        }
        if (bufferSize > 0) {
            this.bufferedOutput = new BufferedOutputStream(fsOutput, bufferSize);
            logger.debug("buffering with size {}", bufferSize);
        }
    }

    protected void closeFile() throws IOException {
        if (bufferedOutput != null) {
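            // closing the buffered stream flushes it and also closes the wrapped fsOutput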
            bufferedOutput.close();
            bufferedOutput = null;
        }
        if (fsOutput != null) {
            fsOutput.close();
            fsOutput = null;
        }
    }

    /**
     * Creates the {@link FileSystem} instance used by this operator.
     *
     * @param context the operator context
     */
    @Override
    public void setup(OperatorContext context) {
        try {
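            // newInstance() returns a non-cached FileSystem, so closing it in
            // teardown() does not affect other users of the shared cache that
            // FileSystem.get() would return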
            fs = FileSystem.newInstance(new Configuration());
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    @Override
    public void teardown() {
        try {
            closeFile();
            if (fs != null) {
                fs.close();
            }
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
        fs = null;
        append = false;
    }

    /**
     * The file name. This can be a relative path for the default file system or a fully qualified URI as accepted by
     * {@link org.apache.hadoop.fs.Path}. For splits with a per-file size limit, the name needs to contain substitution
     * tokens to generate unique file names. Example: file:///mydir/adviews.out.%(operatorId).part-%(partIndex)
     *
     * @param filePath
     * The pattern of the output file
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Returns the pattern of the output file.
     *
     * @return the output file pattern
     */
    public String getFilePath() {
        return this.filePath;
    }

    /**
     * Append to an existing file. Default is true.
     *
     * @param append
     * Whether the operator should append to a file of the same name if one already exists
     */
    public void setAppend(boolean append) {
        this.append = append;
    }

    /**
     * Bytes are written to the underlying file stream once they exceed this size.<br>
     * Use this parameter if the file system in use does not provide sufficient buffering. HDFS does its own buffering
     * (although an additional layer of buffering on top appears to help), but other file system abstractions may not. <br>
     *
     * @param bufferSize size of the in-memory buffer in bytes; a value of 0 (the default) disables this extra buffering
     */
    public void setBufferSize(int bufferSize) {
        this.bufferSize = bufferSize;
    }

    /**
     * Replication factor. A value <= 0 indicates that the file system's default replication setting is used.
     *
     * @param replication the replication factor for newly created files
     */
    public void setReplication(int replication) {
        this.replication = replication;
    }

    public long getTotalBytesWritten() {
        return totalBytesWritten;
    }

    /**
     * Returns the serialized byte array for the given tuple.
     *
     * @param t incoming tuple
     * @return the bytes to write to the file for this tuple
     */
    protected abstract byte[] getBytesForTuple(INPUT t);

    private static final Logger logger = LoggerFactory.getLogger(AbstractHdfsFileOutputOperator.class);
}
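
Example

To make the contract concrete, here is a minimal sketch of a subclass that writes each String tuple as one line of text. Everything in it is illustrative rather than part of the library: the class name LineHdfsFileOutputOperator, the choice to open a single file in setup(), and the newline-per-tuple encoding are all assumptions.

import java.io.IOException;

import org.apache.hadoop.fs.Path;

import com.datatorrent.api.Context.OperatorContext;

/**
 * Illustrative sketch only: writes each String tuple as one line of text.
 */
public class LineHdfsFileOutputOperator extends AbstractHdfsFileOutputOperator<String> {
    @Override
    public void setup(OperatorContext context) {
        super.setup(context); // base class creates the FileSystem instance
        try {
            // open the configured output file once, up front
            openFile(new Path(getFilePath()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void processTuple(String t) {
        byte[] bytes = getBytesForTuple(t);
        try {
            // write through the buffer when one was configured, else directly
            if (bufferedOutput != null) {
                bufferedOutput.write(bytes);
            } else {
                fsOutput.write(bytes);
            }
            totalBytesWritten += bytes.length;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    protected byte[] getBytesForTuple(String t) {
        // one line per tuple, platform default charset (acceptable for a sketch)
        return (t + "\n").getBytes();
    }
}

Configured through the inherited setters, e.g. op.setFilePath("hdfs:///tmp/lines.out"), op.setAppend(false) and op.setBufferSize(64 * 1024), the operator writes one line per tuple; the inherited teardown() then flushes and closes both the stream and the FileSystem.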