Example usage for org.apache.hadoop.fs FileStatus getBlockSize

List of usage examples for org.apache.hadoop.fs FileStatus getBlockSize

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileStatus getBlockSize.

Prototype

public long getBlockSize() 

Document

Get the block size of the file.
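
A minimal, self-contained sketch of a typical call follows; the default Configuration and the path "/tmp/example.txt" are assumptions for illustration, not taken from any example below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        // Assumes the default configuration; the path is hypothetical.
        FileSystem fs = FileSystem.get(new Configuration());
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt"));
        // getBlockSize() returns the block size of the file in bytes.
        System.out.println("Block size: " + status.getBlockSize() + " bytes");
        fs.close();
    }
}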

Usage

From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java

License:Apache License

/**
 * Computes the actual split size. The computed split size will be no larger than the given
 * max split size and no smaller than the given min split size, unless the number of bytes
 * between the offset and the file length is smaller than the min split size.
 *
 * @param fileStatus The FileStatus of the file to split on.
 * @param offset Starting offset for the split.
 * @param minSplitSize Minimum size for the split.
 * @param maxSplitSize Maximum size for the split.
 * @return The computed split size.
 */
private long computeSplitSize(FileStatus fileStatus, long offset, long minSplitSize, long maxSplitSize) {
    long blockSize = fileStatus.getBlockSize();
    long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
    return Math.min(splitSize, fileStatus.getLen() - offset);
}
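
As an illustration with assumed values (not from the source): if minSplitSize = 32 MB, maxSplitSize = 512 MB, and the block size is 128 MB, then Math.min(512 MB, 128 MB) = 128 MB and Math.max(32 MB, 128 MB) = 128 MB, so the split follows the block size; if only 10 MB remain between the offset and the end of the file, the final Math.min caps the split at 10 MB.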

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");

    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
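
A note on the loop condition: SPLIT_SLOP is 1.1 in stock Hadoop FileInputFormat, so the last split may absorb a remainder of up to 10% of splitSize. With assumed values of splitSize = 128 MB and 140 MB of the file remaining, 140 / 128 ≈ 1.09 does not exceed 1.1, so the loop exits and the whole 140 MB becomes a single final split.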

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job The current BSPJob job
 * @return input splits
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = 0L;
            if (job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1) == 1) {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize;
                } else {
                    splitSize = job.getSplitSize();
                }
            } else {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                } else {
                    splitSize = job.getSplitSize() * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                }
            }
            LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.info("[Split Number] " + splits.size());
    return splits;
}
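
To illustrate the split-factor branch with assumed values: with USER_BC_BSP_JOB_SPLIT_FACTOR = 2, a 64 MB block size, and job.getSplitSize() = 0, splitSize = 64 MB * 2 = 128 MB; if the job instead sets an explicit split size of 32 MB, splitSize = 32 MB * 2 = 64 MB.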

From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java

License:Open Source License

private void testCreate(Path path, boolean override) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    FileSystem fs = FileSystem.get(getJettyURL().toURI(), conf);
    FsPermission permission = new FsPermission(FsAction.READ_WRITE, FsAction.NONE, FsAction.NONE);
    OutputStream os = fs.create(new Path(path.toUri().getPath()), permission, override, 1024, (short) 2,
            100 * 1024 * 1024, null);
    os.write(1);
    os.close();
    fs.close();

    fs = FileSystem.get(getHadoopConf());
    FileStatus status = fs.getFileStatus(path);
    Assert.assertEquals(status.getReplication(), 2);
    Assert.assertEquals(status.getBlockSize(), 100 * 1024 * 1024);
    Assert.assertEquals(status.getPermission(), permission);
    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}

From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java

License:Open Source License

private void testListStatus() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    FileStatus status1 = fs.getFileStatus(path);
    fs.close();

    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    FileStatus status2 = fs.getFileStatus(new Path(path.toUri().getPath()));
    fs.close();

    Assert.assertEquals(status2.getPermission(), status1.getPermission());
    Assert.assertEquals(status2.getPath().toUri().getPath(), status1.getPath().toUri().getPath());
    Assert.assertEquals(status2.getReplication(), status1.getReplication());
    Assert.assertEquals(status2.getBlockSize(), status1.getBlockSize());
    Assert.assertEquals(status2.getAccessTime(), status1.getAccessTime());
    Assert.assertEquals(status2.getModificationTime(), status1.getModificationTime());
    Assert.assertEquals(status2.getOwner(), status1.getOwner());
    Assert.assertEquals(status2.getGroup(), status1.getGroup());
    Assert.assertEquals(status2.getLen(), status1.getLen());

    FileStatus[] stati = fs.listStatus(path.getParent());
    Assert.assertEquals(stati.length, 1);
    Assert.assertEquals(stati[0].getPath().getName(), path.getName());
}

From source file:com.cloudera.hoop.fs.FSUtils.java

License:Open Source License

/**
 * Converts a Hadoop <code>FileStatus</code> object into a JSON
 * object. It replaces the <code>SCHEME://HOST:PORT</code> of the path
 * with the specified URL.
 * <p/>
 * @param status Hadoop file status.
 * @param hoopBaseUrl base URL to replace the
 * <code>SCHEME://HOST:PORT</code> in the file status.
 * @return The JSON representation of the file status.
 */
@SuppressWarnings("unchecked")
public static Map fileStatusToJSON(FileStatus status, String hoopBaseUrl) {
    Map json = new LinkedHashMap();
    json.put("path", convertPathToHoop(status.getPath(), hoopBaseUrl).toString());
    json.put("isDir", status.isDir());
    json.put("len", status.getLen());
    json.put("owner", status.getOwner());
    json.put("group", status.getGroup());
    json.put("permission", permissionToString(status.getPermission()));
    json.put("accessTime", status.getAccessTime());
    json.put("modificationTime", status.getModificationTime());
    json.put("blockSize", status.getBlockSize());
    json.put("replication", status.getReplication());
    return json;
}
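
An illustrative example of the resulting map serialized as JSON; the key names come from the code above, while the concrete values (host, user, times, sizes, and the permission string format) are assumptions:

{
  "path": "http://hoop-host:14000/user/foo/bar.txt",
  "isDir": false,
  "len": 1024,
  "owner": "foo",
  "group": "supergroup",
  "permission": "rw-r--r--",
  "accessTime": 1325376000000,
  "modificationTime": 1325376000000,
  "blockSize": 67108864,
  "replication": 3
}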

From source file:com.cloudera.impala.util.LoadMetadataUtil.java

License:Apache License

/**
 * For filesystems that don't override getFileBlockLocations, synthesize file blocks by
 * manually splitting the file range into fixed-size blocks. That way, scan ranges can
 * be derived from file blocks as usual. All synthesized blocks are given an invalid
 * network address so that the scheduler will treat them as remote.
 *
 * Must be threadsafe. Access to 'hostIndex' must be protected.
 */
private static void synthesizeBlockMetadata(FileStatus file, FileDescriptor fd, HdfsFileFormat fileFormat,
        ListMap<TNetworkAddress> hostIndex) {
    long start = 0;
    long remaining = fd.getFileLength();
    long blockSize = file.getBlockSize();
    if (blockSize < MIN_SYNTHETIC_BLOCK_SIZE)
        blockSize = MIN_SYNTHETIC_BLOCK_SIZE;
    if (!fileFormat.isSplittable(HdfsCompression.fromFileName(fd.getFileName()))) {
        blockSize = remaining;
    }
    while (remaining > 0) {
        long len = Math.min(remaining, blockSize);
        int idx = -1;
        synchronized (hostIndex) {
            idx = hostIndex.getIndex(REMOTE_NETWORK_ADDRESS);
        }
        List<BlockReplica> replicas = Lists.newArrayList(new BlockReplica(idx, false));
        fd.addFileBlock(new FileBlock(start, len, replicas));
        remaining -= len;
        start += len;
    }
}
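
A worked example with assumed sizes: a 300 MB file whose filesystem reports a 128 MB block size is synthesized into three blocks of 128 MB, 128 MB, and 44 MB, each assigned the invalid remote address; if the file format is not splittable, blockSize is set to the remaining length and a single block covers the whole file.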

From source file:com.conductor.s3.S3InputFormatUtils.java

License:Apache License

/**
 * Converts the {@link org.apache.hadoop.fs.FileStatus}s to {@link org.apache.hadoop.mapred.InputSplit}s (MRV1 API).
 * <p>
 * This is taken directly from {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, less any file system
 * operations that do not make sense when using {@code S3}.
 * 
 * @param files
 *            the files to convert
 * @param minSize
 *            the minimum size of the splits
 * @param maxSize
 *            the maximum size of the splits
 * @return the splits.
 */
static List<InputSplit> convertToInputSplitsMRV1(final Iterable<FileStatus> files, final long minSize,
        final long maxSize) {
    final List<InputSplit> splits = Lists.newArrayList();
    for (final FileStatus file : files) {
        // check for valid data for this input format
        checkArgument(!file.isDirectory(), "Cannot pass directories to this method!");
        final String path = file.getPath().toString();
        checkArgument(path.startsWith("s3:") || path.startsWith("s3n:"), "Expected S3 input");

        // create splits out of file
        final long length = file.getLen();
        if (length > 0) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, splitSize, S3_SPLIT_HOST));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(
                        new FileSplit(file.getPath(), length - bytesRemaining, bytesRemaining, S3_SPLIT_HOST));
            }
        }
    }
    return splits;
}

From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:com.edwardsit.spark4n6.EWFImageInputFormat.java

License:Apache License

protected long getChunksPerSplit(FileStatus file) {
    return file.getBlockSize() / chunkSize;
}
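
A worked example with an assumed chunk size: EWF (Expert Witness Format) images commonly use 32 KiB chunks (64 sectors of 512 bytes), so with a 128 MB HDFS block this returns 134217728 / 32768 = 4096 chunks per split.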