co.cask.cdap.data.stream.StreamDataFileSplitter.java Source code

Introduction

Here is the source code for co.cask.cdap.data.stream.StreamDataFileSplitter.java
Source

/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package co.cask.cdap.data.stream;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;

import java.io.IOException;
import java.net.URI;
import java.util.List;

/**
 * Helper class for computing {@link InputSplit} for a stream data file.
 *
 * It splits a stream event file into equal size blocks (except the last block). The split size is computed by
 *
 * <br/><br/>
 * {@code Math.max(minSplitSize, Math.min(maxSplitSize, fileBlockSize)) }
 * <br/><br/>
 *
 * and is then capped by the number of bytes left in the file. Note that {@code minSplitSize} takes precedence
 * when it is larger than {@code maxSplitSize}.
 *
 * Each split produced also carries {@code startTime} and {@code endTime}, which are passed unchanged to the
 * {@link StreamInputSplitFactory} so that only stream events within the given time range get processed.
 */
final class StreamDataFileSplitter {

    private final FileStatus eventFileStatus;

    /**
     * Creates a splitter for the given event file.
     *
     * @param eventFileStatus The {@link FileStatus} of the stream event file to split.
     */
    StreamDataFileSplitter(FileStatus eventFileStatus) {
        this.eventFileStatus = eventFileStatus;
    }

    /**
     * Computes splits for the event file and appends them to the given list.
     *
     * One split is created for every {@code splitSize} bytes of the file as reported by the {@link FileStatus}
     * given at construction time, followed by one additional open-ended split that starts at the end of the file.
     * For an empty file only the open-ended split is produced.
     *
     * @param fs The {@link FileSystem} used to look up block locations of the event file.
     * @param minSplitSize Minimum size for a split.
     * @param maxSplitSize Maximum size for a split.
     * @param startTime Start time passed to every split created.
     * @param endTime End time passed to every split created.
     * @param splits List that the created splits get appended to.
     * @param splitFactory Factory for creating the split objects.
     * @param <T> Type of split created by the factory.
     * @throws IOException If failed to get the block locations from the file system.
     */
    <T> void computeSplits(FileSystem fs, long minSplitSize, long maxSplitSize, long startTime, long endTime,
            List<T> splits, StreamInputSplitFactory<T> splitFactory) throws IOException {

        // Compute the splits based on the min/max size
        Path eventFile = eventFileStatus.getPath();
        Path indexFile = getIndexFile(eventFile);

        long length = eventFileStatus.getLen();
        BlockLocation[] blockLocations = fs.getFileBlockLocations(eventFile, 0, length);

        long offset = 0;
        int blockIndex = 0;

        while (offset < length) {
            // Resume the search from the block found for the previous split.
            blockIndex = getBlockIndex(blockLocations, offset, blockIndex);
            String[] hosts = null;
            if (blockIndex >= 0) {
                hosts = blockLocations[blockIndex].getHosts();
            } else {
                // No block covers this offset: the split gets no host hint and the next search restarts from
                // the first block (-1 is not a valid start index).
                blockIndex = 0;
            }

            // computeSplitSize always returns a positive value here since offset < length,
            // hence the loop is guaranteed to make progress.
            long splitSize = computeSplitSize(eventFileStatus, offset, minSplitSize, maxSplitSize);
            splits.add(
                    splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, splitSize, hosts));
            offset += splitSize;
        }

        // One extra split for the tail of the file. It starts at the known file length, is unbounded in size and
        // has no host hint.
        // NOTE(review): presumably this covers events appended after the FileStatus was captured - confirm.
        splits.add(
                splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, Long.MAX_VALUE, null));
    }

    /**
     * Returns the array index of the given blockLocations that contains the given offset.
     *
     * @param blockLocations Array of {@link BlockLocation} to search for; may be {@code null}.
     * @param offset File offset.
     * @param startIdx Starting index for the search in the array; must not be negative.
     * @return The array index of the {@link BlockLocation} that contains the given offset, or {@code -1} if
     *         {@code blockLocations} is {@code null} or no block at or after {@code startIdx} contains the offset.
     */
    private static int getBlockIndex(BlockLocation[] blockLocations, long offset, int startIdx) {
        if (blockLocations == null) {
            return -1;
        }
        for (int i = startIdx; i < blockLocations.length; i++) {
            BlockLocation blockLocation = blockLocations[i];
            long startOffset = blockLocation.getOffset();
            long endOffset = startOffset + blockLocation.getLength();

            // A block covers the half-open range [startOffset, endOffset).
            if (startOffset <= offset && offset < endOffset) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Computes the actual split size. The preferred size is the file block size bounded by the given max split
     * size and then raised to the given min split size (so the min wins if it is larger than the max).
     * The result is never larger than the number of bytes between the offset and the file length.
     *
     * If the preferred size turns out to be non-positive (e.g. the file system reports a block size of zero and
     * no positive minimum is given), the whole remainder of the file is used as one split. Without this, the
     * caller would never advance its offset and would loop forever.
     *
     * @param fileStatus The FileStatus of the file to split on.
     * @param offset Starting offset for the split.
     * @param minSplitSize Minimum size for the split.
     * @param maxSplitSize Maximum size for the split.
     * @return The size of the split starting at the given offset. It is positive whenever the offset is smaller
     *         than the file length.
     */
    private static long computeSplitSize(FileStatus fileStatus, long offset, long minSplitSize, long maxSplitSize) {
        long remaining = fileStatus.getLen() - offset;
        long blockSize = fileStatus.getBlockSize();
        long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        if (splitSize <= 0) {
            // Degenerate sizes: don't split further, just consume the rest of the file.
            return remaining;
        }
        return Math.min(splitSize, remaining);
    }

    /**
     * Derives the index file path from the event file path by replacing the trailing characters of the event
     * path, as many as the length of the {@link StreamFileType#EVENT} suffix, with the
     * {@link StreamFileType#INDEX} suffix.
     *
     * This does not verify that the event path actually ends with the event suffix; callers are assumed to pass
     * an event file.
     *
     * @param eventFile Path of the stream event file.
     * @return Path of the corresponding index file.
     */
    private static Path getIndexFile(Path eventFile) {
        String eventPath = eventFile.toUri().toString();
        int extLength = StreamFileType.EVENT.getSuffix().length();
        String basePath = eventPath.substring(0, eventPath.length() - extLength);
        return new Path(URI.create(String.format("%s%s", basePath, StreamFileType.INDEX.getSuffix())));
    }
}