Example usage for org.apache.hadoop.fs FileStatus getBlockSize

List of usage examples for org.apache.hadoop.fs FileStatus getBlockSize

Introduction

This page collects example usages of org.apache.hadoop.fs.FileStatus#getBlockSize, drawn from open-source projects.

Prototype

public long getBlockSize() 

Document

Get the block size of the file.
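
Before the project examples below, here is a minimal, self-contained sketch of the call itself. The path is a placeholder and the class is purely illustrative; on most file systems a directory's FileStatus reports a block size of 0.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);               // path to an existing file (placeholder)
        FileSystem fs = path.getFileSystem(conf);

        FileStatus status = fs.getFileStatus(path);
        long blockSize = status.getBlockSize();      // block size the file was written with
        System.out.println(path + ": block size = " + blockSize + " bytes");
    }
}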

Usage

From source file: org.apache.tajo.storage.FileTablespace.java

License: Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @throws IOException
 */
public List<Fragment> getSplits(String tableName, TableMeta meta, Schema schema, Path... inputs)
        throws IOException {
    // generate splits

    List<Fragment> splits = Lists.newArrayList();
    List<Fragment> volumeSplits = Lists.newArrayList();
    List<BlockLocation> blockLocations = Lists.newArrayList();

    for (Path p : inputs) {
        ArrayList<FileStatus> files = Lists.newArrayList();
        if (fs.isFile(p)) {
            files.addAll(Lists.newArrayList(fs.getFileStatus(p)));
        } else {
            files.addAll(listStatus(p));
        }

        int previousSplitSize = splits.size();
        for (FileStatus file : files) {
            Path path = file.getPath();
            long length = file.getLen();
            if (length > 0) {
                // Get locations of blocks of file
                BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
                boolean splittable = isSplittable(meta, schema, path, file);
                if (blocksMetadataEnabled && fs instanceof DistributedFileSystem) {

                    if (splittable) {
                        for (BlockLocation blockLocation : blkLocations) {
                            volumeSplits.add(makeSplit(tableName, path, blockLocation));
                        }
                        blockLocations.addAll(Arrays.asList(blkLocations));

                    } else { // Non splittable
                        long blockSize = blkLocations[0].getLength();
                        if (blockSize >= length) {
                            blockLocations.addAll(Arrays.asList(blkLocations));
                            for (BlockLocation blockLocation : blkLocations) {
                                volumeSplits.add(makeSplit(tableName, path, blockLocation));
                            }
                        } else {
                            splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                        }
                    }

                } else {
                    if (splittable) {

                        long minSize = Math.max(getMinSplitSize(), 1);

                        long blockSize = file.getBlockSize(); // the s3n REST API reports a block size, but getFileBlockLocations returns only a single location
                        long splitSize = Math.max(minSize, blockSize);
                        long bytesRemaining = length;

                        // for s3
                        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, splitSize,
                                    blkLocations[blkIndex].getHosts()));
                            bytesRemaining -= splitSize;
                        }
                        if (bytesRemaining > 0) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, bytesRemaining,
                                    blkLocations[blkIndex].getHosts()));
                        }
                    } else { // Non splittable
                        splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                    }
                }
            } else {
                //for zero length files
                splits.add(makeSplit(tableName, path, 0, length));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("# of splits per partition: " + (splits.size() - previousSplitSize));
        }
    }

    // Combine original fileFragments with new VolumeId information
    setVolumeMeta(volumeSplits, blockLocations);
    splits.addAll(volumeSplits);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}
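
In the non-HDFS branch above, the split size is simply Math.max(getMinSplitSize(), file.getBlockSize()). The sketch below mirrors the arithmetic of that loop; the block size, minimum split size and SPLIT_SLOP value are assumptions (1.1 matches Hadoop's FileInputFormat), not Tajo's configuration.

// Illustrative sketch only; block size, minimum split size and SPLIT_SLOP are assumed values.
static int estimateSplitCount(long length, long blockSize, long minSplitSize) {
    final double SPLIT_SLOP = 1.1;                      // assumed, as in Hadoop's FileInputFormat
    long splitSize = Math.max(Math.max(minSplitSize, 1), blockSize);
    int splits = 0;
    long bytesRemaining = length;
    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        splits++;
        bytesRemaining -= splitSize;
    }
    if (bytesRemaining > 0) {
        splits++;                                       // final, possibly short, split
    }
    return splits;        // e.g. a 1 GiB file with 128 MiB blocks yields 8 splits
}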

From source file: org.apache.tajo.storage.TestFileSystems.java

License: Apache License

@Test
public void testBlockSplit() throws IOException {

    Schema schema = new Schema();
    schema.addColumn("id", Type.INT4);
    schema.addColumn("age", Type.INT4);
    schema.addColumn("name", Type.TEXT);

    TableMeta meta = CatalogUtil.newTableMeta(StoreType.CSV);

    Tuple[] tuples = new Tuple[4];
    for (int i = 0; i < tuples.length; i++) {
        tuples[i] = new VTuple(3);
        tuples[i].put(new Datum[] { DatumFactory.createInt4(i), DatumFactory.createInt4(i + 32),
                DatumFactory.createText("name" + i) });
    }

    Path path = StorageUtil.concatPath(testDir, "testGetScannerAndAppender", "table.csv");
    fs.mkdirs(path.getParent());

    Appender appender = sm.getAppender(meta, schema, path);
    appender.init();
    for (Tuple t : tuples) {
        appender.addTuple(t);
    }
    appender.close();
    FileStatus fileStatus = fs.getFileStatus(path);

    List<Fragment> splits = sm.getSplits("table", meta, schema, path);
    int splitSize = (int) Math.ceil(fileStatus.getLen() / (double) fileStatus.getBlockSize());
    assertEquals(splitSize, splits.size());

    for (Fragment fragment : splits) {
        assertTrue(fragment.getLength() <= fileStatus.getBlockSize());
    }
}

From source file: org.cdlib.was.weari.pig.ArcListInputFormat.java

License: Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
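
The splitSize here comes from FileInputFormat.computeSplitSize(blockSize, minSize, maxSize); in stock Hadoop that helper clamps the value returned by getBlockSize() between the configured minimum and maximum split sizes. A paraphrase is shown below only to make the role of the block size explicit; it is not part of this source file.

// Paraphrase of Hadoop's FileInputFormat.computeSplitSize, shown only to make
// the role of FileStatus#getBlockSize in split sizing explicit.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}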

From source file: org.elasticsearch.hadoop.mr.NTFSLocalFileSystem.java

License: Apache License

@Override
public FileStatus getFileStatus(Path f) throws IOException {
    // it's the RawFS in place which messes things up as it dynamically returns the permissions...
    // workaround by doing a copy
    FileStatus fs = super.getFileStatus(f);

    // work-around for Hive 0.14
    if (SCRATCH_DIR.equals(f.toString())) {
        System.out.println("Faking scratch dir permissions on Windows...");

        return new FileStatus(fs.getLen(), fs.isDir(), fs.getReplication(), fs.getBlockSize(),
                fs.getModificationTime(), fs.getAccessTime(), SCRATCH_DIR_PERMS, fs.getOwner(), fs.getGroup(),
                fs.getPath());
        // this doesn't work since the RawFS impl has its own algo that does the lookup dynamically
        //fs.getPermission().fromShort((short) 777);
    }
    return fs;
}

From source file: org.exem.flamingo.agent.nn.hdfs.HdfsFileInfo.java

License: Apache License

public HdfsFileInfo(FileStatus fileStatus, ContentSummary contentSummary) {
    this.fullyQualifiedPath = fileStatus.getPath().toUri().getPath();
    this.filename = isEmpty(getFilename(fullyQualifiedPath)) ? getDirectoryName(fullyQualifiedPath)
            : getFilename(fullyQualifiedPath);
    this.length = fileStatus.isFile() ? fileStatus.getLen() : contentSummary.getLength();
    this.path = getPath(fullyQualifiedPath);
    this.directory = fileStatus.isDirectory();
    this.file = !fileStatus.isDirectory();
    this.owner = fileStatus.getOwner();
    this.group = fileStatus.getGroup();
    this.blockSize = fileStatus.getBlockSize();
    this.replication = fileStatus.getReplication();
    this.modificationTime = fileStatus.getModificationTime();
    if (contentSummary != null) {
        this.spaceConsumed = contentSummary.getSpaceConsumed();
        this.spaceQuota = contentSummary.getSpaceQuota();
        this.quota = contentSummary.getQuota();
        this.directoryCount = contentSummary.getDirectoryCount();
        this.fileCount = contentSummary.getFileCount();
    }
    this.accessTime = fileStatus.getAccessTime();
    this.permission = fileStatus.getPermission().toString();
}
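
A hypothetical caller for this constructor might look as follows; the path and FileSystem setup are placeholders, and whether a ContentSummary is fetched for plain files is left to the application (the constructor above only needs it for directories).

// Hypothetical usage sketch (path and FileSystem setup are assumptions):
FileSystem fs = FileSystem.get(new Configuration());
Path path = new Path("/user/hadoop/data");                     // placeholder path
FileStatus status = fs.getFileStatus(path);
ContentSummary summary = status.isDirectory() ? fs.getContentSummary(path) : null;
HdfsFileInfo info = new HdfsFileInfo(status, summary);         // blockSize taken from status.getBlockSize()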

From source file: org.gridgain.grid.kernal.ggfs.hadoop.GridGgfsHadoopFileSystemWrapper.java

License: Open Source License

/** {@inheritDoc} */
@Override
public Collection<GridGgfsFile> listFiles(GridGgfsPath path) throws GridException {
    try {
        FileStatus[] statuses = fileSys.listStatus(convert(path));

        if (statuses == null)
            throw new GridGgfsFileNotFoundException("Failed to list files (path not found): " + path);

        Collection<GridGgfsFile> res = new ArrayList<>(statuses.length);

        for (FileStatus status : statuses) {
            GridGgfsFileInfo fsInfo = status.isDirectory() ? new GridGgfsFileInfo(true, properties(status))
                    : new GridGgfsFileInfo((int) status.getBlockSize(), status.getLen(), null, null, false,
                            properties(status));

            res.add(new GridGgfsFileImpl(new GridGgfsPath(path, status.getPath().getName()), fsInfo, 1));
        }

        return res;
    } catch (FileNotFoundException ignored) {
        throw new GridGgfsFileNotFoundException("Failed to list files (path not found): " + path);
    } catch (IOException e) {
        throw handleSecondaryFsError(e,
                "Failed to list statuses due to secondary file system exception: " + path);
    }
}

From source file: org.gridgain.grid.kernal.ggfs.hadoop.GridGgfsHadoopFileSystemWrapper.java

License: Open Source License

/** {@inheritDoc} */
@Override
public GridGgfsFile info(final GridGgfsPath path) throws GridException {
    try {
        final FileStatus status = fileSys.getFileStatus(convert(path));

        if (status == null)
            return null;

        final Map<String, String> props = properties(status);

        return new GridGgfsFile() {
            @Override
            public GridGgfsPath path() {
                return path;
            }

            @Override
            public boolean isFile() {
                return status.isFile();
            }

            @Override
            public boolean isDirectory() {
                return status.isDirectory();
            }

            @Override
            public int blockSize() {
                return (int) status.getBlockSize();
            }

            @Override
            public long groupBlockSize() {
                return status.getBlockSize();
            }

            @Override
            public long accessTime() {
                return status.getAccessTime();
            }

            @Override
            public long modificationTime() {
                return status.getModificationTime();
            }

            @Override
            public String property(String name) throws IllegalArgumentException {
                String val = props.get(name);

                if (val == null)
                    throw new IllegalArgumentException(
                            "File property not found [path=" + path + ", name=" + name + ']');

                return val;
            }

            @Nullable
            @Override
            public String property(String name, @Nullable String dfltVal) {
                String val = props.get(name);

                return val == null ? dfltVal : val;
            }

            @Override
            public long length() {
                return status.getLen();
            }

            /** {@inheritDoc} */
            @Override
            public Map<String, String> properties() {
                return props;
            }
        };

    } catch (FileNotFoundException ignore) {
        return null;
    } catch (IOException e) {
        throw handleSecondaryFsError(e, "Failed to get file status [path=" + path + "]");
    }
}
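
Note that both wrappers narrow the long returned by getBlockSize() to an int for blockSize(); on a file system configured with blocks of 2 GiB or larger that cast would overflow. A defensive variant (a sketch only, not part of the GridGain code) could fail fast instead:

@Override
public int blockSize() {
    // Sketch only: throws ArithmeticException instead of silently overflowing.
    return Math.toIntExact(status.getBlockSize());
}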

From source file: org.hedera.io.input.WikiRevisionInputFormat.java

License: Apache License

/** 
 * This code is copied from StreamWikiDumpNewInputFormat.java by Yusuke Matsubara.
 * Thanks to Tu Meteora for adjusting the code to the new mapreduce framework
 * @param job the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext jc) throws IOException {

    List<FileStatus> files = listStatus(jc);
    List<FileStatus> remainingFiles = new ArrayList<>();

    List<InputSplit> splits = new ArrayList<InputSplit>();
    long totalSize = 0;

    // New features: Load splits from the index
    // Check the index before performing the split on the physical files
    Configuration conf = jc.getConfiguration();

    String mapFile = conf.get(SPLIT_MAPFILE_LOC);
    MapFile.Reader reader = null;
    Text key = null;
    RevisionSplits val = new RevisionSplits();
    try {
        if (mapFile != null) {
            reader = new MapFile.Reader(new Path(mapFile + "/part-r-00000"), conf);
            key = new Text();
        }

        // check we have valid files
        for (FileStatus file : files) {
            if (file.isDirectory()) {
                throw new IOException("Not a file: " + file.getPath());
            }

            // if found in the index, load the splits into main memory, otherwise
            // add to remainings for next processing
            if (reader != null) {
                key.set(file.getPath().toString());
                if (reader.seek(key)) {
                    reader.get(key, val);
                    FileSplit[] spl = val.splits();
                    for (FileSplit sp : spl)
                        splits.add(sp);
                    continue;
                }
            }
            remainingFiles.add(file);
            totalSize += file.getLen();
        }
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(jc));

        // 2014-06-06: Tuan _ I have to manually increase the file split size
        // here to cope with Wikipedia Revision .bz2 file - the decompressor
        // takes too long to run
        long goalSize = totalSize / 3;

        for (FileStatus file : remainingFiles) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            for (InputSplit x : getSplits(jc, file, splitSize))
                splits.add(x);
        }
    } finally {
        if (reader != null)
            reader.close();
    }

    return splits;
}
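
Here the block size acts mainly as an upper bound: assuming computeSplitSize follows the classic mapred convention of max(minSize, min(goalSize, blockSize)) (an assumption; the helper itself is not shown in this excerpt), the hand-tuned goalSize of totalSize / 3 only takes effect while it stays below the value reported by getBlockSize().

// Assumed to mirror org.apache.hadoop.mapred.FileInputFormat#computeSplitSize;
// the actual helper in WikiRevisionInputFormat is not shown in this excerpt.
protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
}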

From source file: org.imageterrier.hadoop.mapreduce.PositionAwareSequenceFileInputFormat.java

License: Mozilla Public License

/** 
 * Generate the list of files and make them into FileSplits.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    int splitnum = 0;

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);

                splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, length - bytesRemaining,
                        splitSize, blkLocations[blkIndex].getHosts()), splitnum++));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()), splitnum++));
            }
        } else if (length != 0) {
            splits.add(new PositionAwareSplitWrapper<FileSplit>(
                    new FileSplit(path, 0, length, blkLocations[0].getHosts()), splitnum++));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, 0, length, new String[0]),
                    splitnum++));
        }
    }

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file: org.mrgeo.cmd.mrsimageinfo.MrsImageInfo.java

License: Apache License

private void printFileInfo(final Path pfile, PrintStream out) throws IOException {
    // TODO: The following is HDFS-specific; needs to be refactored
    final FileSystem fs = pfile.getFileSystem(config);
    final FileStatus stat = fs.getFileStatus(pfile);

    out.print("    date: " + DateTimeFormat.shortDateTime().print(stat.getModificationTime()));
    out.println("  size: " + human(stat.getLen()));

    final FsPermission p = stat.getPermission();

    if (debug) {
        out.print("    ");
        out.print(stat.isDir() ? "d" : "f");
        out.print(" u: " + stat.getOwner() + " (" + p.getUserAction().toString().toLowerCase() + ")");
        out.print(" g: " + stat.getGroup() + " (" + p.getGroupAction().toString().toLowerCase() + ")");
        out.print(" o: " + "(" + p.getOtherAction().toString().toLowerCase() + ")");

        out.print(" blk: " + human(stat.getBlockSize()));
        out.println(" repl: " + stat.getReplication());
    }
}
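
The human() helper is not part of the excerpt; a minimal stand-in that would produce this kind of output for getLen() and getBlockSize() might look like the following (purely illustrative, not MrGeo's actual implementation):

// Illustrative stand-in for the human() helper used above.
private static String human(long bytes) {
    final String[] units = { "B", "KB", "MB", "GB", "TB" };
    double value = bytes;
    int unit = 0;
    while (value >= 1024 && unit < units.length - 1) {
        value /= 1024;
        unit++;
    }
    return String.format("%.1f%s", value, units[unit]);
}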