com.datatorrent.lib.io.fs.AbstractFileSplitter.java Source code

Java tutorial

Introduction

Here is the source code for com.datatorrent.lib.io.fs.AbstractFileSplitter.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.io.fs;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

import javax.annotation.Nullable;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Preconditions;
import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.common.util.BaseOperator;
import com.datatorrent.lib.io.block.BlockMetadata;

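/**
 * An abstract file splitter. For each file supplied by {@link #getFileInfo()} it emits the file's metadata and the
 * metadata of the blocks the file is divided into, limiting the number of block metadata emitted per window to the
 * configured blocks threshold.
 *
 * @since 3.2.0
 */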
@org.apache.hadoop.classification.InterfaceStability.Evolving
public abstract class AbstractFileSplitter extends BaseOperator {
    // Block size used to split files, in the same unit as the file length reported by getFileStatus().
    // May be left null; setup() then replaces it with getDefaultBlockSize().
    protected Long blockSize;
    // Running counter that supplies the low 32 bits of every generated block id (see populateBlockIds()).
    private int sequenceNo;

    /**
     * This is a threshold on the no. of blocks emitted per window. A lot of blocks emitted
     * per window can overwhelm the downstream operators. This setting helps to control that.
     */
    @Min(1)
    protected int blocksThreshold;

    // Per-window counter compared against blocksThreshold; reset to 0 in beginWindow().
    protected transient long blockCount;

    // Iterator over the blocks of the file currently being split. It is set back to null once all of its blocks
    // have been emitted, so a non-null value means blocks of a file are still pending.
    // NOTE(review): not transient, so presumably it is checkpointed with the operator state - confirm.
    protected BlockMetadataIterator blockMetadataIterator;

    // Values captured from the operator context in setup(); currentWindowId is also updated in beginWindow().
    protected transient int operatorId;
    protected transient Context.OperatorContext context;
    protected transient long currentWindowId;

    // Number of file metadata emitted in the current window; reset to 0 in beginWindow().
    @AutoMetric
    protected int filesProcessed;

    // Output port for the metadata of each processed file.
    public final transient DefaultOutputPort<FileMetadata> filesMetadataOutput = new DefaultOutputPort<>();
    // Output port for the metadata of each block of a processed file.
    public final transient DefaultOutputPort<BlockMetadata.FileBlockMetadata> blocksMetadataOutput = new DefaultOutputPort<>();

    /**
     * Validates the configured block size, captures the operator id, the context and the activation window id, and
     * falls back to {@link #getDefaultBlockSize()} when no block size was configured.
     *
     * @param context operator context
     */
    @Override
    public void setup(Context.OperatorContext context) {
        // a missing block size is fine (a default is applied below); a configured one must be positive
        final boolean blockSizeAcceptable = blockSize == null || blockSize > 0;
        Preconditions.checkArgument(blockSizeAcceptable, "invalid block size");

        this.operatorId = context.getId();
        this.context = context;
        this.currentWindowId = context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID);

        if (blockSize != null) {
            return;
        }
        blockSize = getDefaultBlockSize();
    }

    /**
     * Starts a new window: remembers its id and clears the per-window file and block counters.
     *
     * @param windowId id of the window that begins
     */
    @Override
    public void beginWindow(long windowId) {
        this.currentWindowId = windowId;
        this.blockCount = 0;
        this.filesProcessed = 0;
    }

    /**
     * Emits as much metadata as the blocks threshold allows: first the pending blocks of a file that was only
     * partially emitted, then the files supplied by {@link #getFileInfo()} until there are none left or the
     * threshold is reached.
     */
    protected void process() {
        final boolean blocksPending = blockMetadataIterator != null;
        if (blocksPending && blockCount < blocksThreshold) {
            emitBlockMetadata();
        }

        while (blockCount < blocksThreshold) {
            final FileInfo nextFile = getFileInfo();
            if (nextFile == null) {
                // nothing more to split right now
                return;
            }
            if (!processFileInfo(nextFile)) {
                // threshold reached while emitting the blocks of this file
                return;
            }
        }
    }

    /**
     * Supplies the next file to split. {@link #process()} stops pulling files as soon as this returns
     * {@code null}.
     *
     * @return the next {@link FileInfo}, or {@code null} when there is none
     */
    protected abstract FileInfo getFileInfo();

    /**
     * Emits the metadata of one file and, unless it is a directory, starts emitting the metadata of its blocks.
     *
     * @param fileInfo file info
     * @return false if the blocks threshold was reached before all the blocks of the file were emitted;
     *         true otherwise
     */
    protected boolean processFileInfo(FileInfo fileInfo) {
        final FileMetadata metadata;
        try {
            metadata = buildFileMetadata(fileInfo);
        } catch (IOException e) {
            throw new RuntimeException("creating metadata", e);
        }

        filesMetadataOutput.emit(metadata);
        filesProcessed++;

        if (metadata.isDirectory()) {
            // a directory has no blocks
            return true;
        }

        blockMetadataIterator = new BlockMetadataIterator(this, metadata, blockSize);
        // false means the block threshold was reached with blocks of this file still pending
        return emitBlockMetadata();
    }

    /**
     * Emits the metadata of the pending blocks of the current file until either all of them are emitted or the
     * blocks threshold of the window is reached. When every block has been emitted the iterator is cleared; when
     * the threshold stops the emission the iterator is kept so that a later call can resume from it.
     * <p>
     * {@code blockCount} is incremented only for blocks that are actually emitted, so it never exceeds the
     * threshold because of a refused block.
     *
     * @return true if all the blocks were emitted; false otherwise
     */
    protected boolean emitBlockMetadata() {
        if (blockMetadataIterator == null) {
            // no file is in progress, so there is nothing left to emit
            return true;
        }
        while (blockMetadataIterator.hasNext()) {
            if (blockCount >= blocksThreshold) {
                // threshold reached: leave the iterator in place for the next call
                return false;
            }
            blockCount++;
            this.blocksMetadataOutput.emit(blockMetadataIterator.next());
        }
        blockMetadataIterator = null;
        return true;
    }

    /**
     * Builds the metadata of one block of a file.
     *
     * @param pos                 offset of the block
     * @param lengthOfFileInBlock length of the block in file; {@link BlockMetadataIterator} passes the file
     *                            position at which the block ends
     * @param blockNumber         block number, starting at 1
     * @param fileMetadata        file metadata; its block ids must already be populated
     * @param isLast              last block of the file
     * @return block file metadata
     */
    protected BlockMetadata.FileBlockMetadata buildBlockMetadata(long pos, long lengthOfFileInBlock,
            int blockNumber, FileMetadata fileMetadata, boolean isLast) {
        final BlockMetadata.FileBlockMetadata block = createBlockMetadata(fileMetadata);
        final int idIndex = blockNumber - 1;

        block.setBlockId(fileMetadata.getBlockIds()[idIndex]);
        block.setOffset(pos);
        block.setLength(lengthOfFileInBlock);
        block.setLastBlock(isLast);

        // the first block has no predecessor, which is signalled with -1
        final long previousBlockId;
        if (blockNumber == 1) {
            previousBlockId = -1;
        } else {
            previousBlockId = fileMetadata.getBlockIds()[idIndex - 1];
        }
        block.setPreviousBlockId(previousBlockId);

        return block;
    }

    /**
     * Instantiates the block metadata for a file. Can be overridden for creating block metadata of a type that
     * extends {@link BlockMetadata.FileBlockMetadata}.
     *
     * @param fileMetadata metadata of the file the block belongs to
     * @return new block metadata carrying the file path
     */
    protected BlockMetadata.FileBlockMetadata createBlockMetadata(FileMetadata fileMetadata) {
        final String pathOfFile = fileMetadata.getFilePath();
        return new BlockMetadata.FileBlockMetadata(pathOfFile);
    }

    /**
     * Creates the file metadata and fills in the file name, directory flag, length, relative path and, for a
     * regular file, the number of blocks and their ids.
     *
     * @param fileInfo file information
     * @return file-metadata
     * @throws IOException if the status of the file cannot be obtained
     */
    protected FileMetadata buildFileMetadata(FileInfo fileInfo) throws IOException {
        final String filePath = fileInfo.getFilePath();
        LOG.debug("file {}", filePath);
        final FileMetadata metadata = createFileMetadata(fileInfo);
        LOG.debug("fileMetadata {}", metadata);

        final Path path = new Path(filePath);
        metadata.setFileName(path.getName());

        final FileStatus status = getFileStatus(path);
        final boolean directory = status.isDirectory();
        final long fileLength = status.getLen();
        metadata.setDirectory(directory);
        metadata.setFileLength(fileLength);

        final String relativePath;
        if (fileInfo.getDirectoryPath() != null) {
            relativePath = getRelativePathWithFolderName(fileInfo);
        } else {
            // Direct filename is given as input.
            relativePath = status.getPath().getName();
        }
        metadata.setRelativePath(relativePath);

        if (directory) {
            return metadata;
        }

        // number of blocks = file length divided by block size, rounded up
        long blocks = fileLength / blockSize;
        if (fileLength % blockSize != 0) {
            blocks++;
        }
        int noOfBlocks = (int) blocks;
        if (metadata.getDataOffset() >= fileLength) {
            // the data starts at or beyond the end of the file, so there is nothing to read
            noOfBlocks = 0;
        }
        metadata.setNumberOfBlocks(noOfBlocks);
        populateBlockIds(metadata);
        return metadata;
    }

    /**
     * As folder name was given to input for copy, prefix folder name to the sub items to copy.
     * <p>
     * The folder name and the relative file path are joined with {@link Path#SEPARATOR} instead of the platform
     * dependent {@code File.separator}: both parts come from Hadoop paths, which use a forward slash on every
     * platform.
     *
     * @param fileInfo file information with a non-null directory path
     * @return name of the input directory followed by the path of the file relative to that directory
     */
    private String getRelativePathWithFolderName(FileInfo fileInfo) {
        String parentDir = new Path(fileInfo.getDirectoryPath()).getName();
        return parentDir + Path.SEPARATOR + fileInfo.getRelativeFilePath();
    }

    /**
     * Instantiates the file metadata for a file. This can be over-ridden to create file metadata of a type that
     * extends {@link FileMetadata}.
     *
     * @param fileInfo file information
     * @return file-metadata carrying the full path of the file
     */
    protected FileMetadata createFileMetadata(FileInfo fileInfo) {
        final String fullPath = fileInfo.getFilePath();
        return new FileMetadata(fullPath);
    }

    /**
     * Assigns an id to every block of the file. An id holds the operator id in its upper 32 bits and the next
     * value of the running sequence number in its lower 32 bits.
     *
     * @param fileMetadata file metadata whose number of blocks is already set
     */
    protected void populateBlockIds(FileMetadata fileMetadata) {
        final int count = fileMetadata.getNumberOfBlocks();
        final long[] ids = new long[count];
        final long operatorBits = ((long) operatorId) << 32;
        for (int index = 0; index < count; index++) {
            // mask so that a negative sequence number cannot spill into the operator bits
            final long sequenceBits = sequenceNo & 0xFFFFFFFFL;
            sequenceNo++;
            ids[index] = operatorBits | sequenceBits;
        }
        fileMetadata.setBlockIds(ids);
    }

    /**
     * Get default block size which is used when the user hasn't specified block size.
     * {@link #setup(Context.OperatorContext)} calls this only when the configured block size is {@code null}.
     *
     * @return default block size.
     */
    protected abstract long getDefaultBlockSize();

    /**
     * Get status of a file. The returned status provides the directory flag, the length and the path that
     * {@link #buildFileMetadata(FileInfo)} copies into the file metadata.
     *
     * @param path path of a file
     * @return file status
     * @throws IOException if the implementation fails to obtain the status
     */
    protected abstract FileStatus getFileStatus(Path path) throws IOException;

    /**
     * Sets the block size used to split files.
     *
     * @param blockSize block size; may be {@code null}, in which case the default block size is applied during
     *                  setup. A non-null value that is not positive is rejected during setup.
     */
    public void setBlockSize(Long blockSize) {
        this.blockSize = blockSize;
    }

    /**
     * Gets the block size used to split files.
     *
     * @return block size; {@code null} if none was configured and setup has not applied the default yet
     */
    public Long getBlockSize() {
        return blockSize;
    }

    /**
     * Sets number of blocks to be emitted per window.
     * <p>
     * A lot of blocks emitted per window can overwhelm the downstream operators. Set this value considering
     * blockSize and readersCount.
     *
     * @param threshold maximum number of blocks to emit per window; the field is constrained to be at least 1
     */
    public void setBlocksThreshold(int threshold) {
        this.blocksThreshold = threshold;
    }

    /**
     * Gets number of blocks to be emitted per window.
     * <p>
     * A lot of blocks emitted per window can overwhelm the downstream operators.
     *
     * @return maximum number of blocks emitted per window
     */
    public int getBlocksThreshold() {
        return blocksThreshold;
    }

    /**
     * An {@link Iterator} for Block-Metadatas of a file. Blocks are produced in file order, starting with the block
     * that contains the data offset of the file.
     */
    protected static class BlockMetadataIterator implements Iterator<BlockMetadata.FileBlockMetadata> {
        private final FileMetadata fileMetadata;
        private final long blockSize;

        // position in the file at which the next block starts
        private long pos;
        // number (starting at 1) of the block returned last; 0 before the first call to next()
        private int blockNumber;

        private final AbstractFileSplitter splitter;

        protected BlockMetadataIterator() {
            //for kryo
            fileMetadata = null;
            blockSize = -1;
            splitter = null;
        }

        /**
         * Creates an iterator over the blocks of a file.
         *
         * @param splitter     splitter used to build the metadata of each block
         * @param fileMetadata metadata of the file to iterate over
         * @param blockSize    size of a block
         */
        protected BlockMetadataIterator(AbstractFileSplitter splitter, FileMetadata fileMetadata, long blockSize) {
            this.splitter = splitter;
            this.fileMetadata = fileMetadata;
            this.blockSize = blockSize;
            this.pos = fileMetadata.getDataOffset();
            this.blockNumber = 0;
        }

        /**
         * @return true while the current position lies before the end of the file
         */
        @Override
        public boolean hasNext() {
            return pos < fileMetadata.getFileLength();
        }

        /**
         * Builds the metadata of the next block and moves the position to the end of that block.
         *
         * @return metadata of the next block
         * @throws java.util.NoSuchElementException if the file has no more blocks
         */
        @Override
        public BlockMetadata.FileBlockMetadata next() {
            if (!hasNext()) {
                // honour the Iterator contract instead of building a block beyond the end of the file
                throw new java.util.NoSuchElementException("no more blocks");
            }
            // advance to the first block that ends beyond the current position; this skips the blocks that
            // lie entirely before the data offset
            long blockEnd;
            do {
                blockEnd = blockSize * ++blockNumber;
            } while (blockEnd <= pos);

            boolean isLast = blockEnd >= fileMetadata.getFileLength();
            // the last block ends with the file, every other block ends on a block boundary
            long lengthOfFileInBlock = isLast ? fileMetadata.getFileLength() : blockEnd;
            BlockMetadata.FileBlockMetadata fileBlock = splitter.buildBlockMetadata(pos, lengthOfFileInBlock,
                    blockNumber, fileMetadata, isLast);
            pos = lengthOfFileInBlock;
            return fileBlock;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove not supported");
        }
    }

    /**
     * Describes a file: its path, name, relative path, length, data offset, discover time, whether it is a
     * directory, and the number and ids of its blocks.
     */
    public static class FileMetadata {
        @NotNull
        private String filePath;
        private String fileName;
        private int numberOfBlocks;
        private long dataOffset;
        private long fileLength;
        private long discoverTime;
        private long[] blockIds;
        private boolean isDirectory;
        private String relativePath;

        /**
         * No-argument constructor for kryo; leaves the file path unset and stamps the discover time with the
         * current time.
         */
        @SuppressWarnings("unused")
        protected FileMetadata() {
            //for kryo
            this.filePath = null;
            this.discoverTime = System.currentTimeMillis();
        }

        /**
         * Creates the metadata of the file at the given path and stamps the discover time with the current time.
         *
         * @param filePath file path
         */
        public FileMetadata(@NotNull String filePath) {
            this.filePath = filePath;
            this.discoverTime = System.currentTimeMillis();
        }

        /**
         * Copy constructor. Every field is taken over from the given metadata, including its discover time; the
         * array of block ids is shared with the source, not duplicated.
         *
         * @param fileMetadata metadata to copy
         */
        protected FileMetadata(FileMetadata fileMetadata) {
            this();
            this.filePath = fileMetadata.filePath;
            this.fileName = fileMetadata.fileName;
            this.numberOfBlocks = fileMetadata.numberOfBlocks;
            this.dataOffset = fileMetadata.dataOffset;
            this.fileLength = fileMetadata.fileLength;
            this.discoverTime = fileMetadata.discoverTime;
            this.blockIds = fileMetadata.blockIds;
            this.isDirectory = fileMetadata.isDirectory;
            this.relativePath = fileMetadata.relativePath;
        }

        /**
         * @return total number of blocks
         */
        public int getNumberOfBlocks() {
            return this.numberOfBlocks;
        }

        /**
         * @param numberOfBlocks total number of blocks
         */
        public void setNumberOfBlocks(int numberOfBlocks) {
            this.numberOfBlocks = numberOfBlocks;
        }

        /**
         * @return file name
         */
        public String getFileName() {
            return this.fileName;
        }

        /**
         * @param fileName file name
         */
        public void setFileName(String fileName) {
            this.fileName = fileName;
        }

        /**
         * @param filePath file path
         */
        public void setFilePath(String filePath) {
            this.filePath = filePath;
        }

        /**
         * @return file path
         */
        public String getFilePath() {
            return this.filePath;
        }

        /**
         * @return data offset
         */
        public long getDataOffset() {
            return this.dataOffset;
        }

        /**
         * @param offset data offset
         */
        public void setDataOffset(long offset) {
            this.dataOffset = offset;
        }

        /**
         * @return file length
         */
        public long getFileLength() {
            return this.fileLength;
        }

        /**
         * @param fileLength file length
         */
        public void setFileLength(long fileLength) {
            this.fileLength = fileLength;
        }

        /**
         * @return time at which the file was discovered
         */
        public long getDiscoverTime() {
            return this.discoverTime;
        }

        /**
         * @param discoverTime time at which the file was discovered
         */
        public void setDiscoverTime(long discoverTime) {
            this.discoverTime = discoverTime;
        }

        /**
         * @return ids of the blocks associated with the file (the internal array, not a copy)
         */
        public long[] getBlockIds() {
            return this.blockIds;
        }

        /**
         * @param blockIds ids of the blocks of the file (stored as given, not copied)
         */
        public void setBlockIds(long[] blockIds) {
            this.blockIds = blockIds;
        }

        /**
         * @param isDirectory true if the metadata describes a directory
         */
        public void setDirectory(boolean isDirectory) {
            this.isDirectory = isDirectory;
        }

        /**
         * @return true if it is a directory; false otherwise.
         */
        public boolean isDirectory() {
            return this.isDirectory;
        }

        /**
         * Gets relative file path
         *
         * @return relativePath
         */
        public String getRelativePath() {
            return this.relativePath;
        }

        /**
         * Sets relative file path
         *
         * @param relativePath relative file path
         */
        public void setRelativePath(String relativePath) {
            this.relativePath = relativePath;
        }

        /**
         * @return text listing the file name, number of blocks, directory flag and relative path
         */
        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder("FileMetadata [fileName=");
            text.append(fileName);
            text.append(", numberOfBlocks=").append(numberOfBlocks);
            text.append(", isDirectory=").append(isDirectory);
            text.append(", relativePath=").append(relativePath);
            text.append("]");
            return text.toString();
        }

    }

    /**
     * Identifies a file by an optional directory path and the path of the file relative to that directory.
     */
    public static class FileInfo {
        protected final String directoryPath;
        protected final String relativeFilePath;

        /**
         * No-argument constructor that leaves both paths {@code null}.
         */
        protected FileInfo() {
            this(null, null);
        }

        /**
         * @param directoryPath    directory path; {@code null} when the file is given directly
         * @param relativeFilePath path of the file relative to the directory, or the file path itself when there
         *                         is no directory
         */
        public FileInfo(@Nullable String directoryPath, @NotNull String relativeFilePath) {
            this.directoryPath = directoryPath;
            this.relativeFilePath = relativeFilePath;
        }

        /**
         * @return directory path
         */
        public String getDirectoryPath() {
            return this.directoryPath;
        }

        /**
         * @return path relative to directory
         */
        public String getRelativeFilePath() {
            return this.relativeFilePath;
        }

        /**
         * @return full path of the file: the relative path resolved against the directory, or the relative path
         *         unchanged when there is no directory
         */
        public String getFilePath() {
            if (directoryPath != null) {
                final Path resolved = new Path(directoryPath, relativeFilePath);
                return resolved.toUri().getPath();
            }
            return relativeFilePath;
        }
    }

    // Logger of this class; used for the debug traces written while building file metadata.
    private static final Logger LOG = LoggerFactory.getLogger(AbstractFileSplitter.class);
}