Example usage for org.apache.hadoop.fs FileStatus getBlockSize

List of usage examples for org.apache.hadoop.fs FileStatus getBlockSize

Introduction

This page collects example usages of the org.apache.hadoop.fs.FileStatus method getBlockSize().

Prototype

public long getBlockSize() 

Document

Get the block size of the file.
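
Before the project examples below, here is a minimal, self-contained sketch of a typical call. The class name, sample path, and block-count arithmetic are illustrative assumptions, not taken from any of the projects quoted on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical path; point this at a file that actually exists.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);

        FileStatus status = fs.getFileStatus(path);
        long blockSize = status.getBlockSize(); // block size of this file, in bytes
        long length = status.getLen();

        // Rough block count for the file (illustration only; zero-length files yield 0).
        long blocks = blockSize > 0 ? (length + blockSize - 1) / blockSize : 0;

        System.out.println(path + ": len=" + length + " blockSize=" + blockSize + " blocks=" + blocks);
    }
}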

Usage

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

public static <T extends TextSerializable, O extends TextSerializable> int sampleWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    FileStatus inFStatus = fs.getFileStatus(files[0]);
    if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
        // Either a directory of files or a large file
        return sampleMapReduceWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    } else {
        // A single small file, process it without MapReduce
        return sampleLocalWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    }
}

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsedMillis());
    }
    return splits;
}

From source file:com.streamsets.pipeline.stage.origin.hdfs.spooler.HdfsFile.java

License:Apache License

@SuppressWarnings("unchecked")
public Map<String, Object> getFileMetadata() throws IOException {
    FileStatus file = fs.getFileStatus(filePath);
    Map<String, Object> metadata = new HashMap<>();
    metadata.put(HeaderAttributeConstants.FILE_NAME, file.getPath().getName());
    metadata.put(HeaderAttributeConstants.FILE, file.getPath().toUri().getPath());
    metadata.put(HeaderAttributeConstants.LAST_MODIFIED_TIME, file.getModificationTime());
    metadata.put(HeaderAttributeConstants.LAST_ACCESS_TIME, file.getAccessTime());
    metadata.put(HeaderAttributeConstants.IS_DIRECTORY, file.isDirectory());
    metadata.put(HeaderAttributeConstants.IS_SYMBOLIC_LINK, file.isSymlink());
    metadata.put(HeaderAttributeConstants.SIZE, file.getLen());
    metadata.put(HeaderAttributeConstants.OWNER, file.getOwner());
    metadata.put(HeaderAttributeConstants.GROUP, file.getGroup());
    metadata.put(HeaderAttributeConstants.BLOCK_SIZE, file.getBlockSize());
    metadata.put(HeaderAttributeConstants.REPLICATION, file.getReplication());
    metadata.put(HeaderAttributeConstants.IS_ENCRYPTED, file.isEncrypted());

    FsPermission permission = file.getPermission();
    if (permission != null) {
        metadata.put(PERMISSIONS, permission.toString());
    }

    return metadata;
}

From source file:com.tripadvisor.hadoop.VerifyHdfsBackup.java

License:Apache License

/**
 * Method to go through the HDFS filesystem in a DFS to find all
 * files
 *
 * fs:FileSystem object from HDFS
 * maxDate:Newest date for files to be backed up
 * p:Path in HDFS to look for files
 **/
public void checkDir(FileSystem fs, Path p, String sLocalPathRoot, long maxDate) {
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, fStat[i].getPath(), sLocalPathRoot, maxDate);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // tripmonster to regular hive tables to partitioned
            // hive tables.  We use table names to both exclude
            // some from the backup, and for the rest to dump out
            // the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                return;
            }

            // check the file
            FileStatus stat = fs.getFileStatus(p);

            // ignore files that are too new
            if ((stat.getModificationTime() / 1000) > maxDate) {
                System.out.println("IGNORING: " + sPath + " too new");
                return;
            }

            // warn about files that have a mis-matching block
            // size.  The checksum check will fail for them
            // anyways, so just catch it here.
            if (stat.getBlockSize() != N_BLOCK_SIZE) {
                System.out.println("ERROR: non-default block size (" + (stat.getBlockSize() / (1024 * 1024))
                        + "M) would fail checksum: " + sPath);
                return;
            }

            // get HDFS checksum
            FileChecksum ck = fs.getFileChecksum(p);
            String sCk, sCkShort;
            if (ck == null) {
                sCk = sCkShort = "<null>";
            } else {
                sCk = ck.toString();
                sCkShort = sCk.replaceAll("^.*:", "");
            }

            System.out.println(sPath + " len=" + stat.getLen() + " " + stat.getOwner() + "/" + stat.getGroup()
                    + " checksum=" + sCk);

            // find the local file
            String sFsPath = sLocalPathRoot + p.toUri().getPath();
            File fLocal = new File(sFsPath);
            if (!fLocal.exists()) {
                Calendar cal = Calendar.getInstance();
                cal.setTimeInMillis(stat.getModificationTime());

                System.out.println("ERROR: file does not exist: " + sFsPath + " hdfs-last-mtime="
                        + cal.getTime().toString());
                return;
            }
            if (!fLocal.isFile()) {
                System.out.println("ERROR: path is not a file: " + sFsPath);
                return;
            }
            if (stat.getLen() != fLocal.length()) {
                System.out.println("ERROR: length mismatch: " + sFsPath + " hdfslen=" + stat.getLen()
                        + " fslen=" + fLocal.length());
                return;
            }

            // get local fs checksum
            FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
            if (ckLocal == null) {
                System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
                return;
            }

            // compare checksums as a string, to strip the
            // algorithm name from the beginning
            String sCkLocal = ckLocal.toString();
            String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");

            if (false == sCkShort.equals(sCkLocalShort)) {
                System.out.println(
                        "ERROR: checksum mismatch: " + sFsPath + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal);
                return;
            }
        }
    } catch (IOException e) {
        System.out.println("ERROR: could not open " + p + ": " + e);

        // System.exit(1) ;
    }
}

From source file:com.uber.hoodie.common.table.timeline.dto.FileStatusDTO.java

License:Apache License

public static FileStatusDTO fromFileStatus(FileStatus fileStatus) {
    if (null == fileStatus) {
        return null;
    }

    FileStatusDTO dto = new FileStatusDTO();
    try {
        dto.path = FilePathDTO.fromPath(fileStatus.getPath());
        dto.length = fileStatus.getLen();
        dto.isdir = fileStatus.isDirectory();
        dto.blockReplication = fileStatus.getReplication();
        dto.blocksize = fileStatus.getBlockSize();
        dto.modificationTime = fileStatus.getModificationTime();
        dto.accessTime = fileStatus.getModificationTime();
        dto.symlink = fileStatus.isSymlink() ? FilePathDTO.fromPath(fileStatus.getSymlink()) : null;
        safeReadAndSetMetadata(dto, fileStatus);
    } catch (IOException ioe) {
        throw new HoodieException(ioe);
    }
    return dto;
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
            minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, bytesRemaining,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(makeSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.wandisco.s3hdfs.rewrite.filter.S3HdfsTestUtil.java

License:Apache License

void compareS3ObjectWithHdfsFile(InputStream objectStream, Path path, long rangeStart, long rangeEnd)
        throws IOException, ServiceException {
    FileStatus fsStat = hdfs.listStatus(path)[0];
    int expectedSize = (int) (rangeEnd - rangeStart);
    int blockSize = (int) fsStat.getBlockSize();
    int blocks = (int) Math.ceil((double) expectedSize / (double) blockSize);

    DataInputStream origStream = hdfs.open(path);
    assertEquals(origStream.skip(rangeStart), rangeStart);

    int size = 0;

    for (int i = 0; i < expectedSize; i++) {
        int A = origStream.read();
        int B = objectStream.read();
        if (A == -1 || B == -1)
            fail("Premature end of steam.");
        if (A != B) {
            fail("ERROR: Byte A: " + A + " Byte B: " + B + ", at offset: " + size);
        }
        size++;
    }
    if (size != expectedSize) {
        fail("Incorrect size: " + size + ", expected: " + expectedSize);
    }

    System.out.println("File: " + path + " has " + blocks + " blocks.");
    System.out.println("File: " + path + " has " + blockSize + " blockSize.");
    System.out.println("File: " + path + " has " + expectedSize + " length.");

    System.out.println("SUCCESS! The files match up!");
}

From source file:com.zjy.mongo.splitter.BSONSplitter.java

License:Apache License

public static long getSplitSize(final Configuration conf, final FileStatus file) {
    // Try new configuration options first, but fall back to old ones.
    long maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize",
            conf.getLong("mapred.max.split.size", Long.MAX_VALUE));
    long minSize = Math.max(1L, conf.getLong("mapreduce.input.fileinputformat.split.minsize",
            conf.getLong("mapred.min.split.size", 1L)));

    if (file != null) {
        long fileBlockSize = file.getBlockSize();
        return Math.max(minSize, Math.min(maxSize, fileBlockSize));
    } else {
        long blockSize = conf.getLong("dfs.blockSize", 64 * 1024 * 1024);
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
    public void fileStatusForFile() throws IOException {
        Path file = new Path("/dir/file"); // XXX new Path creates the file
        FileStatus stat = fs.getFileStatus(file);
        assertThat(stat.getPath().toUri().getPath(), is("/dir/file")); // XXX FileStatus.getPath().toUri() -> URI .getPath()
        assertThat(stat.isDir(), is(false)); // XXX assertThat(actual, Matcher) Matcher provides matches method
        assertThat(stat.getLen(), is(7L));
        assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
        assertThat(stat.getReplication(), is((short) 1)); // XXX Matcher<Short> is(Short value) -> o.h.core.Is.<Short>is(Short) static factory method from oh.core.Is
        // XXX which calls the constructor is(Matcher<Short> matcher) with the matcher equalTo(Short) hamcrest-all-1.3-source.java:Is.java:65
        assertThat(stat.getBlockSize(), is(64 * 1024 * 1024L));
        assertThat(stat.getOwner(), is(System.getProperty("user.name")));
        assertThat(stat.getGroup(), is("supergroup"));
        assertThat(stat.getPermission().toString(), is("rw-r--r--"));
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
    public void fileStatusForDirectory() throws IOException {
        Path dir = new Path("/dir"); // XXX new Path creates the directory
        FileStatus stat = fs.getFileStatus(dir);
        assertThat(stat.getPath().toUri().getPath(), is("/dir"));
        assertThat(stat.isDir(), is(true));
        assertThat(stat.getLen(), is(0L));
        assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
        assertThat(stat.getReplication(), is((short) 0));
        assertThat(stat.getBlockSize(), is(0L));
        assertThat(stat.getOwner(), is(System.getProperty("user.name")));
        assertThat(stat.getGroup(), is("supergroup"));
        assertThat(stat.getPermission().toString(), is("rwxr-xr-x"));
    }