Example usage for org.apache.hadoop.fs FileStatus getBlockSize

List of usage examples for org.apache.hadoop.fs FileStatus getBlockSize

Introduction

This page presents usage examples for org.apache.hadoop.fs.FileStatus.getBlockSize().

Prototype

public long getBlockSize() 

Document

Get the block size of the file.
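
A minimal call sketch (the configuration and path here are placeholders, not taken from the examples below):

// Requires org.apache.hadoop.conf.Configuration and org.apache.hadoop.fs.{FileSystem, FileStatus, Path}.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
FileStatus status = fs.getFileStatus(new Path("/user/example/data.txt")); // placeholder path
long blockSize = status.getBlockSize(); // block size, in bytes, of the blocks the file is stored in
System.out.println("Block size: " + blockSize + " bytes");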

Usage

From source file:org.apache.falcon.hadoop.JailedFileSystem.java

License:Apache License

@Override
public FileStatus getFileStatus(Path f) throws IOException {
    FileStatus status = localFS.getFileStatus(toLocalPath(f));
    if (status == null) {
        return null;
    }
    return new FileStatus(status.getLen(), status.isDirectory(), status.getReplication(), status.getBlockSize(),
            status.getModificationTime(), status.getAccessTime(), status.getPermission(), status.getOwner(),
            status.getGroup(),
            fromLocalPath(status.getPath()).makeQualified(this.getUri(), this.getWorkingDirectory()));
}

From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java

License:Apache License

/**
 * Common method for generating the list of vertex/edge input splits.
 *
 * @param job The job
 * @param files Array of FileStatus objects for vertex/edge input files
 * @return The list of vertex/edge input splits
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, List<FileStatus> files) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    return splits;
}
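
The split size used above comes from the computeSplitSize helper inherited from Hadoop's FileInputFormat; in stock Hadoop it simply clamps the file's block size between the configured minimum and maximum split sizes. A reference sketch (not Giraph-specific code):

// The block size, bounded below by minSize and above by maxSize, becomes the split size.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}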

From source file:org.apache.gobblin.data.management.copy.splitter.DistcpFileSplitterTest.java

License:Apache License

private Collection<WorkUnit> createMockSplitWorkUnits(FileSystem fs, long fileLen, long blockSize,
        long maxSplitSize) throws Exception {
    FileStatus file = mock(FileStatus.class);
    when(file.getLen()).thenReturn(fileLen);
    when(file.getBlockSize()).thenReturn(blockSize);

    URI uri = new URI("hdfs", "dummyhost", "/test", "test");
    Path path = new Path(uri);
    when(fs.getUri()).thenReturn(uri);

    CopyableDatasetMetadata cdm = new CopyableDatasetMetadata(new TestCopyableDataset(path));

    CopyableFile cf = CopyableFileUtils.getTestCopyableFile();
    CopyableFile spy = spy(cf);
    doReturn(file).when(spy).getFileStatus();
    doReturn(blockSize).when(spy).getBlockSize(any(FileSystem.class));
    doReturn(path).when(spy).getDestination();

    WorkUnit wu = WorkUnit.createEmpty();
    wu.setProp(DistcpFileSplitter.MAX_SPLIT_SIZE_KEY, maxSplitSize);
    wu.setProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, 1, 0),
            path.toString());
    CopySource.setWorkUnitGuid(wu, Guid.fromStrings(wu.toString()));
    CopySource.serializeCopyEntity(wu, cf);
    CopySource.serializeCopyableDataset(wu, cdm);

    return DistcpFileSplitter.splitFile(spy, wu, fs);
}
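
A hypothetical invocation of this helper from a test body (the sizes are made up, and the expected number of work units depends on DistcpFileSplitter itself):

// Hypothetical call: a 300 MB file, 128 MB blocks, 100 MB max split size.
FileSystem fs = mock(FileSystem.class);
Collection<WorkUnit> workUnits = createMockSplitWorkUnits(
        fs, 300L * 1024 * 1024, 128L * 1024 * 1024, 100L * 1024 * 1024);
Assert.assertFalse(workUnits.isEmpty()); // at least one split work unit is expected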

From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java

License:Apache License

/**
 * Write the contents of input stream into staging path.
 *
 * <p>
 *   WriteAt indicates the path where the contents of the input stream should be written. When this method is called,
 *   the path writeAt.getParent() will exist already, but the path writeAt will not exist. When this method returns,
 *   the path writeAt must exist. Any data written to any location other than writeAt or a descendant of writeAt
 *   will be ignored.
 * </p>
 *
 * @param inputStream {@link FSDataInputStream} whose contents should be written to staging path.
 * @param writeAt {@link Path} at which contents should be written.
 * @param copyableFile {@link org.apache.gobblin.data.management.copy.CopyEntity} that generated this copy operation.
 * @param record The actual {@link FileAwareInputStream} passed to the write method.
 * @throws IOException
 */
protected void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile,
        FileAwareInputStream record) throws IOException {

    final short replication = this.state.getPropAsShort(ConfigurationKeys.WRITER_FILE_REPLICATION_FACTOR,
            copyableFile.getReplication(this.fs));
    final long blockSize = copyableFile.getBlockSize(this.fs);
    final long fileSize = copyableFile.getFileStatus().getLen();

    long expectedBytes = fileSize;
    Long maxBytes = null;
    // Whether writer must write EXACTLY maxBytes.
    boolean mustMatchMaxBytes = false;

    if (record.getSplit().isPresent()) {
        maxBytes = record.getSplit().get().getHighPosition() - record.getSplit().get().getLowPosition();
        if (record.getSplit().get().isLastSplit()) {
            expectedBytes = fileSize % blockSize;
            mustMatchMaxBytes = false;
        } else {
            expectedBytes = maxBytes;
            mustMatchMaxBytes = true;
        }
    }

    Predicate<FileStatus> fileStatusAttributesFilter = new Predicate<FileStatus>() {
        @Override
        public boolean apply(FileStatus input) {
            return input.getReplication() == replication && input.getBlockSize() == blockSize;
        }
    };
    Optional<FileStatus> persistedFile = this.recoveryHelper.findPersistedFile(this.state, copyableFile,
            fileStatusAttributesFilter);

    if (persistedFile.isPresent()) {
        log.info(String.format("Recovering persisted file %s to %s.", persistedFile.get().getPath(), writeAt));
        this.fs.rename(persistedFile.get().getPath(), writeAt);
    } else {
        // Copy empty directories
        if (copyableFile.getFileStatus().isDirectory()) {
            this.fs.mkdirs(writeAt);
            return;
        }

        OutputStream os = this.fs.create(writeAt, true, this.fs.getConf().getInt("io.file.buffer.size", 4096),
                replication, blockSize);
        if (encryptionConfig != null) {
            os = EncryptionFactory.buildStreamCryptoProvider(encryptionConfig).encodeOutputStream(os);
        }
        try {
            FileSystem defaultFS = FileSystem.get(new Configuration());
            StreamThrottler<GobblinScopeTypes> throttler = this.taskBroker
                    .getSharedResource(new StreamThrottler.Factory<GobblinScopeTypes>(), new EmptyKey());
            ThrottledInputStream throttledInputStream = throttler.throttleInputStream().inputStream(inputStream)
                    .sourceURI(copyableFile.getOrigin().getPath()
                            .makeQualified(defaultFS.getUri(), defaultFS.getWorkingDirectory()).toUri())
                    .targetURI(this.fs.makeQualified(writeAt).toUri()).build();
            StreamCopier copier = new StreamCopier(throttledInputStream, os, maxBytes)
                    .withBufferSize(this.bufferSize);

            log.info("File {}: Starting copy", copyableFile.getOrigin().getPath());

            if (isInstrumentationEnabled()) {
                copier.withCopySpeedMeter(this.copySpeedMeter);
            }
            long numBytes = copier.copy();
            if ((this.checkFileSize || mustMatchMaxBytes) && numBytes != expectedBytes) {
                throw new IOException(String.format("Incomplete write: expected %d, wrote %d bytes.",
                        expectedBytes, numBytes));
            }
            this.bytesWritten.addAndGet(numBytes);
            if (isInstrumentationEnabled()) {
                log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(),
                        this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate());
            } else {
                log.info("File {} copied.", copyableFile.getOrigin().getPath());
            }
        } catch (NotConfiguredException nce) {
            log.warn("Broker error. Some features of stream copier may not be available.", nce);
        } finally {
            os.close();
            inputStream.close();
        }
    }
}
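
To make the split accounting above concrete, a small worked example with made-up numbers:

// Hypothetical sizes illustrating expectedBytes for the two split branches above.
long fileSize = 250L * 1024 * 1024;   // 250 MB file
long blockSize = 128L * 1024 * 1024;  // 128 MB blocks
// Non-last split: expectedBytes = highPosition - lowPosition and must be matched exactly.
// Last split: expectedBytes = fileSize % blockSize = 127926272 bytes (122 MB),
// and the size check applies only when checkFileSize is enabled.
long lastSplitExpected = fileSize % blockSize;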

From source file:org.apache.hama.bsp.FileInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(BSPJob)} when they're too big. <br/>
 * numSplits will be ignored by the framework.
 */
@Override
public InputSplit[] getSplits(BSPJob job, int numSplits) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);

    /*
     * TODO: This does not consider data locality. When the numSplits
     * (user-defined) is equal to or smaller than the number of DFS splits, we
     * should assign multiple splits to a task.
     */

    // take the short circuit path if we have already partitioned
    // if (numSplits == files.length) {
    // for (FileStatus file : files) {
    // if (file != null) {
    // splits.add(new FileSplit(file.getPath(), 0, file.getLen(),
    // new String[0]));
    // }
    // }
    // return splits.toArray(new FileSplit[splits.size()]);
    // }

    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong("bsp.input.files", files.length);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new InputSplit[splits.size()]);
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer.java

License:Apache License

/**
 * Collects a number of basic statistics based on an estimate. Statistics
 * are: number of records, number of hdfs blocks and hdfs block size.
 *
 * @param datapath path is a data source URI that can appear as a file name,
 *            a directory name or a wildcard pattern
 * @return statistics in JSON format
 * @throws Exception if path is wrong, its metadata cannot be retrieved from
 *             file system, or if scanning the first block using the
 *             accessor failed
 */
@Override
public AnalyzerStats getEstimatedStats(String datapath) throws Exception {
    long blockSize = 0;
    long numberOfBlocks;
    long dataSize = 0;
    Path path = new Path(HdfsUtilities.absoluteDataPath(datapath));

    ArrayList<InputSplit> splits = getSplits(path);

    for (InputSplit split : splits) {
        FileSplit fsp = (FileSplit) split;
        dataSize += fsp.getLength();
        if (blockSize == 0) {
            Path filePath = fsp.getPath();
            FileStatus fileStatus = fs.getFileStatus(filePath);
            if (fileStatus.isFile()) {
                blockSize = fileStatus.getBlockSize();
            }
        }
    }

    // if no file is in path (only dirs), get default block size
    if (blockSize == 0) {
        blockSize = fs.getDefaultBlockSize(path);
    }
    numberOfBlocks = splits.size();

    /*
     * The estimate of the number of tuples in table is based on the
     * actual number of tuples in the first block, multiplied by its
     * size compared to the size of the whole data to be read.
     * The calculation:
     * Ratio of tuples to size = number of tuples in first block / first block size.
     * Total of tuples = ratio * number of blocks * total block size.
     */
    long numberOfTuplesInBlock = getNumberOfTuplesInBlock(splits);
    long numberOfTuples = 0;
    if (!splits.isEmpty()) {
        long blockLength = splits.get(0).getLength();
        numberOfTuples = (long) Math.floor((((double) numberOfTuplesInBlock / blockLength) * (dataSize)));
    }
    AnalyzerStats stats = new AnalyzerStats(blockSize, numberOfBlocks, numberOfTuples);

    // print files size to log when in debug level
    Log.debug(AnalyzerStats.dataToString(stats, path.toString()));

    return stats;
}
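
The tuple estimate above is a straightforward proportion; with hypothetical numbers:

// Hypothetical figures for the proportion used above.
long numberOfTuplesInBlock = 1000;    // tuples counted in the first split
long blockLength = 64L * 1024 * 1024; // length of the first split: 64 MB
long dataSize = 640L * 1024 * 1024;   // total data to be read: 640 MB
long numberOfTuples = (long) Math.floor(((double) numberOfTuplesInBlock / blockLength) * dataSize);
// ratio = 1000 tuples per 64 MB, so numberOfTuples = 10 * 1000 = 10000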

From source file:org.apache.hive.common.util.MockFileSystem.java

License:Apache License

public void touch(MockFile file) {
    if (fileStatusMap.containsKey(file)) {
        FileStatus fileStatus = fileStatusMap.get(file);
        FileStatus fileStatusNew = new FileStatus(fileStatus.getLen(), fileStatus.isDirectory(),
                fileStatus.getReplication(), fileStatus.getBlockSize(), fileStatus.getModificationTime() + 1,
                fileStatus.getAccessTime(), fileStatus.getPermission(), fileStatus.getOwner(),
                fileStatus.getGroup(), fileStatus.getPath());
        fileStatusMap.put(file, fileStatusNew);
    }
}

From source file:org.apache.ignite.hadoop.fs.IgniteHadoopIgfsSecondaryFileSystem.java

License:Apache License

/** {@inheritDoc} */
@Override
public Collection<IgfsFile> listFiles(IgfsPath path) {
    try {
        FileStatus[] statuses = fileSys.listStatus(convert(path));

        if (statuses == null)
            throw new IgfsPathNotFoundException("Failed to list files (path not found): " + path);

        Collection<IgfsFile> res = new ArrayList<>(statuses.length);

        for (FileStatus status : statuses) {
            IgfsFileInfo fsInfo = status.isDirectory() ? new IgfsFileInfo(true, properties(status))
                    : new IgfsFileInfo((int) status.getBlockSize(), status.getLen(), null, null, false,
                            properties(status));

            res.add(new IgfsFileImpl(new IgfsPath(path, status.getPath().getName()), fsInfo, 1));
        }

        return res;
    } catch (FileNotFoundException ignored) {
        throw new IgfsPathNotFoundException("Failed to list files (path not found): " + path);
    } catch (IOException e) {
        throw handleSecondaryFsError(e,
                "Failed to list statuses due to secondary file system exception: " + path);
    }
}

From source file:org.apache.ignite.hadoop.fs.IgniteHadoopIgfsSecondaryFileSystem.java

License:Apache License

/** {@inheritDoc} */
@Override
public IgfsFile info(final IgfsPath path) {
    try {
        final FileStatus status = fileSys.getFileStatus(convert(path));

        if (status == null)
            return null;

        final Map<String, String> props = properties(status);

        return new IgfsFile() {
            @Override
            public IgfsPath path() {
                return path;
            }

            @Override
            public boolean isFile() {
                return status.isFile();
            }

            @Override
            public boolean isDirectory() {
                return status.isDirectory();
            }

            @Override
            public int blockSize() {
                // By convention directory has blockSize == 0, while file has blockSize > 0:
                return isDirectory() ? 0 : (int) status.getBlockSize();
            }

            @Override
            public long groupBlockSize() {
                return status.getBlockSize();
            }

            @Override
            public long accessTime() {
                return status.getAccessTime();
            }

            @Override
            public long modificationTime() {
                return status.getModificationTime();
            }

            @Override
            public String property(String name) throws IllegalArgumentException {
                String val = props.get(name);

                if (val == null)
                    throw new IllegalArgumentException(
                            "File property not found [path=" + path + ", name=" + name + ']');

                return val;
            }

            @Nullable
            @Override
            public String property(String name, @Nullable String dfltVal) {
                String val = props.get(name);

                return val == null ? dfltVal : val;
            }

            @Override
            public long length() {
                return status.getLen();
            }

            /** {@inheritDoc} */
            @Override
            public Map<String, String> properties() {
                return props;
            }
        };
    } catch (FileNotFoundException ignore) {
        return null;
    } catch (IOException e) {
        throw handleSecondaryFsError(e, "Failed to get file status [path=" + path + "]");
    }
}
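
A caller of info() can rely on the same convention (a directory reports a block size of 0); a minimal sketch where secondaryFs and the path are placeholders:

// Hypothetical usage of the IgfsFile returned by info().
IgfsFile file = secondaryFs.info(new IgfsPath("/tmp/data"));
if (file == null) {
    System.out.println("Path not found");
} else if (file.blockSize() > 0) {
    System.out.println("File, block size = " + file.blockSize() + " bytes");
} else {
    System.out.println("Directory (block size reported as 0)");
}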

From source file:org.apache.ignite.hadoop.fs.v1.IgniteHadoopFileSystem.java

License:Apache License

/**
 * Convert a file status obtained from the secondary file system to a status of the primary file system.
 *
 * @param status Secondary file system status.
 * @return Primary file system status.
 */
@SuppressWarnings("deprecation")
private FileStatus toPrimary(FileStatus status) {
    return status != null
            ? new FileStatus(status.getLen(), status.isDir(), status.getReplication(), status.getBlockSize(),
                    status.getModificationTime(), status.getAccessTime(), status.getPermission(),
                    status.getOwner(), status.getGroup(), toPrimary(status.getPath()))
            : null;
}
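
The @SuppressWarnings("deprecation") above is presumably there for FileStatus.isDir(); a behaviorally equivalent variant of the same conversion (a sketch, not the Ignite source) uses the non-deprecated accessor:

/** Sketch of the same conversion using FileStatus.isDirectory() instead of the deprecated isDir(). */
private FileStatus toPrimary(FileStatus status) {
    return status != null
            ? new FileStatus(status.getLen(), status.isDirectory(), status.getReplication(),
                    status.getBlockSize(), status.getModificationTime(), status.getAccessTime(),
                    status.getPermission(), status.getOwner(), status.getGroup(),
                    toPrimary(status.getPath()))
            : null;
}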