com.datatorrent.lib.io.fs.FileSplitter.java Source code

Java tutorial

Introduction

Here is the source code for com.datatorrent.lib.io.fs.FileSplitter.java

Source

/*
 * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.lib.io.fs;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import javax.validation.constraints.NotNull;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Operator;
import com.datatorrent.api.annotation.OperatorAnnotation;

import com.datatorrent.lib.io.IdempotentStorageManager;

/**
 * Input operator that scans a directory for files and splits each file into blocks.<br/>
 * For every file, the operator emits the file metadata followed by the metadata of each of its blocks.<br/>
 *
 * @displayName File Splitter
 * @category Input
 * @tags file, input operator
 *
 * @since 2.0.0
 */
@OperatorAnnotation(checkpointableWithinAppWindow = false)
public class FileSplitter extends AbstractFileInputOperator<FileSplitter.FileMetadata>
        implements Operator.CheckpointListener {
    // Size of a block used to split files; when left null, setup() replaces it with the default
    // block size the file system reports for filePath.
    protected Long blockSize;
    // Id of this operator instance, read from the operator context in setup().
    protected transient int operatorId;
    // Running counter that supplies the lower 32 bits of every generated block id.
    private int sequenceNo;

    // Saves, per window, the list of file paths handled in that window so the window can be replayed.
    @NotNull
    protected IdempotentStorageManager idempotentStorageManager;

    // Id of the window currently being processed; assigned in beginWindow().
    protected transient long currentWindowId;
    // Paths of the files split during the current window; saved and then cleared in endWindow().
    protected transient List<String> currentWindowRecoveryState;

    /**
     * Creates a splitter with empty processed/pending file sets, an empty per-window recovery list,
     * no explicit block size and an {@code FSIdempotentStorageManager} as the storage manager.
     */
    public FileSplitter() {
        this.blockSize = null;
        this.currentWindowRecoveryState = Lists.newArrayList();
        this.idempotentStorageManager = new IdempotentStorageManager.FSIdempotentStorageManager();
        this.pendingFiles = Sets.newLinkedHashSet();
        this.processedFiles = Sets.newHashSet();
    }

    /** Output port on which the metadata of each file is emitted. */
    public final transient DefaultOutputPort<FileMetadata> filesMetadataOutput = new DefaultOutputPort<FileMetadata>();
    /** Output port on which the metadata of each block of a file is emitted, after that file's metadata. */
    public final transient DefaultOutputPort<BlockMetadata> blocksMetadataOutput = new DefaultOutputPort<BlockMetadata>();

    /**
     * Sets up the operator: validates the configured block size, delegates to the parent setup, records
     * the operator id, resolves the block size from the file system when none was configured and sets
     * up the idempotent storage manager.
     *
     * @param context operator context that supplies the operator id
     * @throws IllegalArgumentException if a block size was configured and it is not positive
     */
    @Override
    public void setup(Context.OperatorContext context) {
        // Checked explicitly rather than with an assert: assertions are disabled by default, and a zero
        // or negative block size would otherwise only surface later as a division by zero or a negative
        // block count while building file metadata. Done first so nothing is set up for a bad config.
        if (blockSize != null && blockSize <= 0) {
            throw new IllegalArgumentException("invalid block size: " + blockSize);
        }
        super.setup(context);

        operatorId = context.getId();
        if (blockSize == null) {
            // No explicit size configured: fall back to the file system's default for filePath.
            blockSize = fs.getDefaultBlockSize(filePath);
        }
        idempotentStorageManager.setup(context);
    }

    /**
     * Records the window id and, when the window is not newer than the largest recovery window known
     * to the idempotent storage manager, replays it before delegating to the parent operator.
     *
     * @param windowId id of the window that is starting
     */
    @Override
    public void beginWindow(long windowId) {
        currentWindowId = windowId;
        final long largestRecoveryWindow = idempotentStorageManager.getLargestRecoveryWindow();
        final boolean needsReplay = windowId <= largestRecoveryWindow;
        if (needsReplay) {
            replay(windowId);
        }
        super.beginWindow(windowId);
    }

    /**
     * Re-emits the file and block metadata of a window whose list of file paths was previously saved
     * in the idempotent storage manager. Every recovered path is also added to the processed files.
     *
     * @param windowId id of the window to replay
     * @throws RuntimeException wrapping the {@link IOException} if loading the state or building the
     *                          file metadata fails
     */
    protected void replay(long windowId) {
        // FileSplitter is assumed to always be statically partitioned: the operator does little work,
        // so dynamic partitioning of it is not needed.
        try {
            @SuppressWarnings("unchecked")
            final List<String> storedPaths = (List<String>) idempotentStorageManager.load(operatorId, windowId);
            // Nothing stored for this window can happen when there are multiple physical instances and
            // one of them is ahead in processing windows; there is nothing to replay in that case.
            if (storedPaths != null) {
                for (final String storedPath : storedPaths) {
                    processedFiles.add(storedPath);
                    final FileMetadata metadata = buildFileMetadata(storedPath);
                    filesMetadataOutput.emit(metadata);
                    for (Iterator<BlockMetadata> blocks = new BlockMetadataIterator(this, metadata, blockSize);
                            blocks.hasNext();) {
                        blocksMetadataOutput.emit(blocks.next());
                    }
                }
            }
        } catch (IOException ioe) {
            throw new RuntimeException("replay", ioe);
        }
    }

    /**
     * Scans the directory and, for every pending file, emits its file metadata followed by the metadata
     * of each of its blocks. Each handled path is recorded in the current window's recovery state and
     * removed from the pending files once it has been emitted. Does nothing while the current window is
     * not newer than the largest recovery window, because such windows are replayed in beginWindow.
     *
     * @throws RuntimeException wrapping the {@link IOException} if building file metadata fails
     */
    @Override
    public void emitTuples() {
        final boolean withinRecovery = currentWindowId <= idempotentStorageManager.getLargestRecoveryWindow();
        if (withinRecovery) {
            return;
        }

        // Adds newly found files to the processed and pending sets.
        scanDirectory();

        for (Iterator<String> pending = pendingFiles.iterator(); pending.hasNext();) {
            final String pendingPath = pending.next();
            currentWindowRecoveryState.add(pendingPath);
            LOG.debug("file {}", pendingPath);
            try {
                final FileMetadata metadata = buildFileMetadata(pendingPath);
                filesMetadataOutput.emit(metadata);
                for (Iterator<BlockMetadata> blocks = new BlockMetadataIterator(this, metadata, blockSize);
                        blocks.hasNext();) {
                    blocksMetadataOutput.emit(blocks.next());
                }
            } catch (IOException ioe) {
                throw new RuntimeException("creating metadata", ioe);
            }
            // Only reached when the file was emitted completely.
            pending.remove();
        }
    }

    /**
     * Ends the window: after delegating to the parent, saves the paths handled in this window to the
     * idempotent storage manager when the window is newer than the largest recovery window, and then
     * clears the per-window recovery state in every case.
     *
     * @throws RuntimeException wrapping the {@link IOException} if saving the state fails
     */
    @Override
    public void endWindow() {
        super.endWindow();
        final boolean pastRecovery = currentWindowId > idempotentStorageManager.getLargestRecoveryWindow();
        if (pastRecovery) {
            try {
                idempotentStorageManager.save(currentWindowRecoveryState, operatorId, currentWindowId);
            } catch (IOException ioe) {
                throw new RuntimeException("saving recovery", ioe);
            }
        }
        currentWindowRecoveryState.clear();
    }

    /**
     * Tears down the parent operator first and the idempotent storage manager afterwards.
     */
    @Override
    public void teardown() {
        super.teardown();
        this.idempotentStorageManager.teardown();
    }

    /**
     * Builds the metadata of one block. Can be overridden to create block metadata of a type that
     * extends {@link BlockMetadata}.
     *
     * @param pos                 file offset at which the block starts
     * @param lengthOfFileInBlock file length associated with the block
     * @param blockNumber         1-based number of the block; used to look up the block id
     * @param fileMetadata        metadata of the file the block belongs to
     * @param isLast              true if this is the last block of the file
     * @return the block metadata
     */
    protected BlockMetadata createBlockMetadata(long pos, long lengthOfFileInBlock, int blockNumber,
            FileMetadata fileMetadata, boolean isLast) {
        final String path = fileMetadata.getFilePath();
        // Block numbers start at 1 while the block id array is 0-based.
        final long id = fileMetadata.getBlockIds()[blockNumber - 1];
        return new BlockMetadata(pos, lengthOfFileInBlock, path, id, isLast);
    }

    /**
     * Creates a new {@link FileMetadata} for the file path currently held in {@code currentFile}.
     *
     * @return file metadata constructed from the current file path
     * @throws IOException part of the method signature; not thrown by this implementation
     */
    @Override
    protected FileMetadata readEntity() throws IOException {
        return new FileMetadata(currentFile);
    }

    /**
     * Builds the metadata of a file: file name, file length, number of blocks and block ids.
     * The given path also becomes the current file of the operator.
     *
     * @param fPath file-path
     * @return file-metadata
     * @throws IOException if the metadata entity or the file status cannot be read
     */
    protected FileMetadata buildFileMetadata(String fPath) throws IOException {
        currentFile = fPath;
        final Path path = new Path(fPath);

        final FileMetadata metadata = readEntity();
        metadata.setFileName(path.getName());

        final long fileLength = fs.getFileStatus(path).getLen();
        // Number of whole blocks, plus one more when a partially filled block remains at the end.
        final long wholeBlocks = fileLength / blockSize;
        final long trailingBlock = (fileLength % blockSize == 0) ? 0 : 1;
        int blockCount = (int) (wholeBlocks + trailingBlock);
        // A data offset at or beyond the end of the file leaves nothing to split.
        if (metadata.getDataOffset() >= fileLength) {
            blockCount = 0;
        }

        metadata.setFileLength(fileLength);
        metadata.setNumberOfBlocks(blockCount);
        populateBlockIds(metadata);
        return metadata;
    }

    /**
     * Assigns one id to every block of the file and stores the ids in the file metadata.
     * An id holds the operator id in its upper 32 bits and a running sequence number in its lower
     * 32 bits; the sequence number advances by one for every id handed out.
     *
     * @param fileMetadata file metadata whose number of blocks is already set
     */
    protected void populateBlockIds(FileMetadata fileMetadata) {
        final int blockCount = fileMetadata.getNumberOfBlocks();
        final long operatorBits = ((long) operatorId) << 32;
        final long[] ids = new long[blockCount];
        for (int index = 0; index < blockCount; index++) {
            // Masking keeps only the low 32 bits so a negative counter cannot spill into the operator bits.
            final long sequenceBits = sequenceNo++ & 0xFFFFFFFFL;
            ids[index] = operatorBits | sequenceBits;
        }
        fileMetadata.setBlockIds(ids);
    }

    /**
     * Not supported by this operator; file and block metadata are emitted directly on
     * {@link #filesMetadataOutput} and {@link #blocksMetadataOutput} instead.
     *
     * @param tuple ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    protected void emit(FileMetadata tuple) {
        throw new UnsupportedOperationException("not supported");
    }

    /**
     * Checkpoint callback; this operator takes no action.
     *
     * @param windowId id of the checkpointed window (unused)
     */
    @Override
    public void checkpointed(long windowId) {
    }

    /**
     * Commit callback: asks the idempotent storage manager to delete this operator's saved state up to
     * the committed window.
     *
     * @param windowId id of the committed window
     * @throws RuntimeException wrapping the {@link IOException} if the deletion fails
     */
    @Override
    public void committed(long windowId) {
        try {
            this.idempotentStorageManager.deleteUpTo(this.operatorId, windowId);
        } catch (IOException ioe) {
            throw new RuntimeException("deleting state", ioe);
        }
    }

    /**
     * Sets the block size used to split files.
     *
     * @param blockSize block size; when null, setup() uses the file system's default block size
     */
    public void setBlockSize(Long blockSize) {
        this.blockSize = blockSize;
    }

    /**
     * Returns the block size used to split files.
     *
     * @return the block size; may be null if none was set and setup() has not resolved a default yet
     */
    public Long getBlockSize() {
        return blockSize;
    }

    /**
     * Sets the storage manager used to save and load the per-window list of file paths.
     *
     * @param idempotentStorageManager storage manager to use
     */
    public void setIdempotentStorageManager(IdempotentStorageManager idempotentStorageManager) {
        this.idempotentStorageManager = idempotentStorageManager;
    }

    /**
     * Returns the storage manager used to save and load the per-window list of file paths.
     *
     * @return the idempotent storage manager
     */
    public IdempotentStorageManager getIdempotentStorageManager() {
        return idempotentStorageManager;
    }

    /**
     * An {@link Iterator} over the {@link BlockMetadata} of a single file.
     * <p>
     * Iteration starts at the file's data offset and continues until the file length is reached. Block
     * numbers are 1-based and count blocks from the beginning of the file, so blocks that lie entirely
     * before the data offset are skipped rather than renumbered.
     */
    public static class BlockMetadataIterator implements Iterator<BlockMetadata> {
        private final FileMetadata fileMetadata;
        private final long blockSize;

        // File position at which the next block starts.
        private long pos;
        // Number of the most recently returned block; 0 before the first call to next().
        private int blockNumber;

        private final FileSplitter splitter;

        /**
         * Creates an iterator over the blocks of a file.
         *
         * @param splitter     splitter whose {@code createBlockMetadata} builds each block's metadata
         * @param fileMetadata metadata of the file; its data offset, file length and block ids are read
         * @param blockSize    size of a block; must be positive
         * @throws IllegalArgumentException if {@code blockSize} is not positive
         */
        public BlockMetadataIterator(FileSplitter splitter, FileMetadata fileMetadata, long blockSize) {
            if (blockSize <= 0) {
                // A non-positive block size would make the block-boundary search in next() loop forever
                // or produce meaningless boundaries.
                throw new IllegalArgumentException("invalid block size: " + blockSize);
            }
            this.splitter = splitter;
            this.fileMetadata = fileMetadata;
            this.blockSize = blockSize;
            this.pos = fileMetadata.getDataOffset();
            this.blockNumber = 0;
        }

        /**
         * Returns true while the current position is before the end of the file.
         */
        @Override
        public boolean hasNext() {
            return pos < fileMetadata.getFileLength();
        }

        /**
         * Returns the metadata of the next block and advances the position to the end of that block.
         * <p>
         * The value handed to {@code createBlockMetadata} as {@code lengthOfFileInBlock} is the file
         * position at which the block ends (the block boundary, capped at the file length), not the
         * number of bytes in the block.
         *
         * @return metadata of the next block
         * @throws NoSuchElementException if the end of the file has already been reached
         */
        @SuppressWarnings("StatementWithEmptyBody")
        @Override
        public BlockMetadata next() {
            if (!hasNext()) {
                // Required by the Iterator contract; without it an exhausted iterator would run on into
                // an out-of-range block id lookup.
                throw new NoSuchElementException("no more blocks in " + fileMetadata.getFilePath());
            }
            long length;
            // Advance to the first block whose end boundary lies beyond the current position.
            while ((length = blockSize * ++blockNumber) <= pos) {
            }
            boolean isLast = length >= fileMetadata.getFileLength();
            long lengthOfFileInBlock = isLast ? fileMetadata.getFileLength() : length;
            BlockMetadata blockMetadata = splitter.createBlockMetadata(pos, lengthOfFileInBlock, blockNumber,
                    fileMetadata, isLast);
            pos = lengthOfFileInBlock;
            return blockMetadata;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove not supported");
        }
    }

    /**
     * Metadata of one block of a file: the block id, the file path, the file offset and file length
     * associated with the block, and whether the block is the last one of the file.
     * Two instances are equal when their block ids are equal.
     */
    public static class BlockMetadata {
        private final long blockId;
        private final String filePath;
        //file offset associated with the block
        private long offset;
        //file length associated with the block
        private long length;
        private final boolean isLastBlock;

        /**
         * Creates a placeholder instance: id, offset and length are -1, the path is null and the
         * last-block flag is false.
         */
        protected BlockMetadata() {
            this(-1, -1, null, -1, false);
        }

        /**
         * Constructs block metadata.
         *
         * @param offset      offset of the file in the block
         * @param length      length of the file in the block
         * @param filePath    file path
         * @param blockId     block id
         * @param isLastBlock true if this is the last block of file
         */
        public BlockMetadata(long offset, long length, String filePath, long blockId, boolean isLastBlock) {
            this.blockId = blockId;
            this.filePath = filePath;
            this.offset = offset;
            this.length = length;
            this.isLastBlock = isLastBlock;
        }

        /**
         * Returns the block id.
         */
        public long getBlockId() {
            return this.blockId;
        }

        /**
         * Returns the file path.
         */
        public String getFilePath() {
            return this.filePath;
        }

        /**
         * Returns the file offset associated with the block.
         */
        public long getOffset() {
            return this.offset;
        }

        /**
         * Sets the offset of the file in the block.
         */
        public void setOffset(long offset) {
            this.offset = offset;
        }

        /**
         * Returns the length of the file in the block.
         */
        public long getLength() {
            return this.length;
        }

        /**
         * Sets the length of the file in the block.
         */
        public void setLength(long length) {
            this.length = length;
        }

        /**
         * Returns if this is the last block in file.
         */
        public boolean isLastBlock() {
            return this.isLastBlock;
        }

        /**
         * Compares by block id only; any {@link BlockMetadata} (including subclasses) with the same id
         * is considered equal.
         */
        @Override
        public boolean equals(Object o) {
            if (o == this) {
                return true;
            }
            return o instanceof BlockMetadata && ((BlockMetadata) o).blockId == this.blockId;
        }

        /**
         * Returns the block id narrowed to an int, i.e. its lower 32 bits.
         */
        @Override
        public int hashCode() {
            return (int) this.blockId;
        }
    }

    /**
     * Metadata of a file: its path and name, the number of blocks, the data offset, the file length,
     * the discover time and the ids of its blocks.
     */
    public static class FileMetadata {
        @NotNull
        private String filePath;
        private String fileName;
        private int numberOfBlocks;
        private long dataOffset;
        private long fileLength;
        private long discoverTime;
        private long[] blockIds;

        /**
         * Creates metadata without a file path; the discover time is set to the current time in
         * milliseconds.
         */
        protected FileMetadata() {
            this.filePath = null;
            this.discoverTime = System.currentTimeMillis();
        }

        /**
         * Constructs file metadata; the discover time is set to the current time in milliseconds.
         *
         * @param filePath file path
         */
        public FileMetadata(@NotNull String filePath) {
            this.filePath = filePath;
            this.discoverTime = System.currentTimeMillis();
        }

        /**
         * Returns the file path.
         */
        public String getFilePath() {
            return this.filePath;
        }

        /**
         * Sets the file path.
         */
        public void setFilePath(String filePath) {
            this.filePath = filePath;
        }

        /**
         * Returns the file name.
         */
        public String getFileName() {
            return this.fileName;
        }

        /**
         * Sets the file name.
         */
        public void setFileName(String fileName) {
            this.fileName = fileName;
        }

        /**
         * Returns the total number of blocks.
         */
        public int getNumberOfBlocks() {
            return this.numberOfBlocks;
        }

        /**
         * Sets the total number of blocks.
         */
        public void setNumberOfBlocks(int numberOfBlocks) {
            this.numberOfBlocks = numberOfBlocks;
        }

        /**
         * Returns the data offset.
         */
        public long getDataOffset() {
            return this.dataOffset;
        }

        /**
         * Sets the data offset.
         */
        public void setDataOffset(long offset) {
            this.dataOffset = offset;
        }

        /**
         * Returns the file length.
         */
        public long getFileLength() {
            return this.fileLength;
        }

        /**
         * Sets the file length.
         */
        public void setFileLength(long fileLength) {
            this.fileLength = fileLength;
        }

        /**
         * Returns the file discover time.
         */
        public long getDiscoverTime() {
            return this.discoverTime;
        }

        /**
         * Sets the discover time.
         */
        public void setDiscoverTime(long discoverTime) {
            this.discoverTime = discoverTime;
        }

        /**
         * Returns the block ids associated with the file. The internal array itself is returned,
         * not a copy.
         */
        public long[] getBlockIds() {
            return this.blockIds;
        }

        /**
         * Sets the block ids of the file. The given array is stored as is, not copied.
         */
        public void setBlockIds(long[] blockIds) {
            this.blockIds = blockIds;
        }
    }

    // Class-wide SLF4J logger; used for debug output of the files being split.
    private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class);
}