List of usage examples for org.apache.hadoop.fs.FileStatus.getBlockSize()
public long getBlockSize()
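Before the real-world examples below, here is a minimal, self-contained sketch of the call pattern they all share: obtain a FileStatus from a FileSystem, then read the block size in bytes. The path and class name are hypothetical placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical path; replace with a file that exists on your cluster.
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt"));
        System.out.println("Block size in bytes: " + status.getBlockSize());
    }
}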
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
public static <T extends TextSerializable, O extends TextSerializable> int sampleWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    FileStatus inFStatus = fs.getFileStatus(files[0]);
    if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
        // Either a directory of files or a large file
        return sampleMapReduceWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    } else {
        // A single small file, process it without MapReduce
        return sampleLocalWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    }
}
From source file:com.sourcecode.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsedMillis());
    }
    return splits;
}
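The computeSplitSize helper used above is not part of this excerpt; in Hadoop's FileInputFormat it clamps the file's block size between the configured minimum and maximum split sizes, so a split normally covers exactly one HDFS block. A sketch of that logic:

// Clamp blockSize into [minSize, maxSize]: the split size equals the block
// size unless the configured min/max force it larger or smaller.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}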
From source file:com.streamsets.pipeline.stage.origin.hdfs.spooler.HdfsFile.java
License:Apache License
@SuppressWarnings("unchecked") public Map<String, Object> getFileMetadata() throws IOException { FileStatus file = fs.getFileStatus(filePath); Map<String, Object> metadata = new HashMap<>(); metadata.put(HeaderAttributeConstants.FILE_NAME, file.getPath().getName()); metadata.put(HeaderAttributeConstants.FILE, file.getPath().toUri().getPath()); metadata.put(HeaderAttributeConstants.LAST_MODIFIED_TIME, file.getModificationTime()); metadata.put(HeaderAttributeConstants.LAST_ACCESS_TIME, file.getAccessTime()); metadata.put(HeaderAttributeConstants.IS_DIRECTORY, file.isDirectory()); metadata.put(HeaderAttributeConstants.IS_SYMBOLIC_LINK, file.isSymlink()); metadata.put(HeaderAttributeConstants.SIZE, file.getLen()); metadata.put(HeaderAttributeConstants.OWNER, file.getOwner()); metadata.put(HeaderAttributeConstants.GROUP, file.getGroup()); metadata.put(HeaderAttributeConstants.BLOCK_SIZE, file.getBlockSize()); metadata.put(HeaderAttributeConstants.REPLICATION, file.getReplication()); metadata.put(HeaderAttributeConstants.IS_ENCRYPTED, file.isEncrypted()); FsPermission permission = file.getPermission(); if (permission != null) { metadata.put(PERMISSIONS, permission.toString()); }/*from w w w . j ava 2 s. com*/ return metadata; }
From source file:com.tripadvisor.hadoop.VerifyHdfsBackup.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all files
 *
 * fs:FileSystem object from HDFS
 * maxDate:Newest date for files to be backed up
 * p:Path in HDFS to look for files
 **/
public void checkDir(FileSystem fs, Path p, String sLocalPathRoot, long maxDate) {
    FileStatus[] fStat;
    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName())
                    || "jobtracker".equals(p.getName()) || sPath.startsWith("/mapred")
                    || "ops".equals(p.getName()) || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, fStat[i].getPath(), sLocalPathRoot, maxDate);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }
            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // tripmonster to regular hive tables to partitioned
            // hive tables. We use table names to both exclude
            // some from the backup, and for the rest to dump out
            // the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                return;
            }

            // check the file
            FileStatus stat = fs.getFileStatus(p);

            // ignore files that are too new
            if ((stat.getModificationTime() / 1000) > maxDate) {
                System.out.println("IGNORING: " + sPath + " too new");
                return;
            }

            // warn about files that have a mis-matching block
            // size. The checksum check will fail for them
            // anyways, so just catch it here.
            if (stat.getBlockSize() != N_BLOCK_SIZE) {
                System.out.println("ERROR: non-default block size ("
                        + (stat.getBlockSize() / (1024 * 1024)) + "M) would fail checksum: " + sPath);
                return;
            }

            // get HDFS checksum
            FileChecksum ck = fs.getFileChecksum(p);
            String sCk, sCkShort;
            if (ck == null) {
                sCk = sCkShort = "<null>";
            } else {
                sCk = ck.toString();
                sCkShort = sCk.replaceAll("^.*:", "");
            }

            System.out.println(sPath + " len=" + stat.getLen() + " " + stat.getOwner() + "/"
                    + stat.getGroup() + " checksum=" + sCk);

            // find the local file
            String sFsPath = sLocalPathRoot + p.toUri().getPath();
            File fLocal = new File(sFsPath);
            if (!fLocal.exists()) {
                Calendar cal = Calendar.getInstance();
                cal.setTimeInMillis(stat.getModificationTime());
                System.out.println("ERROR: file does not exist: " + sFsPath
                        + " hdfs-last-mtime=" + cal.getTime().toString());
                return;
            }
            if (!fLocal.isFile()) {
                System.out.println("ERROR: path is not a file: " + sFsPath);
                return;
            }
            if (stat.getLen() != fLocal.length()) {
                System.out.println("ERROR: length mismatch: " + sFsPath + " hdfslen="
                        + stat.getLen() + " fslen=" + fLocal.length());
                return;
            }

            // get local fs checksum
            FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
            if (ckLocal == null) {
                System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
                return;
            }

            // compare checksums as a string, to strip the
            // algorithm name from the beginning
            String sCkLocal = ckLocal.toString();
            String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");

            if (false == sCkShort.equals(sCkLocalShort)) {
                System.out.println("ERROR: checksum mismatch: " + sFsPath
                        + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal);
                return;
            }
        }
    } catch (IOException e) {
        System.out.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1) ;
    }
}
From source file:com.uber.hoodie.common.table.timeline.dto.FileStatusDTO.java
License:Apache License
public static FileStatusDTO fromFileStatus(FileStatus fileStatus) {
    if (null == fileStatus) {
        return null;
    }

    FileStatusDTO dto = new FileStatusDTO();
    try {
        dto.path = FilePathDTO.fromPath(fileStatus.getPath());
        dto.length = fileStatus.getLen();
        dto.isdir = fileStatus.isDirectory();
        dto.blockReplication = fileStatus.getReplication();
        dto.blocksize = fileStatus.getBlockSize();
        dto.modificationTime = fileStatus.getModificationTime();
        dto.accessTime = fileStatus.getModificationTime();
        dto.symlink = fileStatus.isSymlink() ? FilePathDTO.fromPath(fileStatus.getSymlink()) : null;
        safeReadAndSetMetadata(dto, fileStatus);
    } catch (IOException ioe) {
        throw new HoodieException(ioe);
    }
    return dto;
}
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(
            job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
            minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, bytesRemaining,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(makeSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
From source file:com.wandisco.s3hdfs.rewrite.filter.S3HdfsTestUtil.java
License:Apache License
void compareS3ObjectWithHdfsFile(InputStream objectStream, Path path, long rangeStart, long rangeEnd)
        throws IOException, ServiceException {
    FileStatus fsStat = hdfs.listStatus(path)[0];
    int expectedSize = (int) (rangeEnd - rangeStart);
    int blockSize = (int) fsStat.getBlockSize();
    int blocks = (int) Math.ceil((double) expectedSize / (double) blockSize);

    DataInputStream origStream = hdfs.open(path);
    assertEquals(origStream.skip(rangeStart), rangeStart);

    int size = 0;

    for (int i = 0; i < expectedSize; i++) {
        int A = origStream.read();
        int B = objectStream.read();
        if (A == -1 || B == -1)
            fail("Premature end of stream.");
        if (A != B) {
            fail("ERROR: Byte A: " + A + " Byte B: " + B + ", at offset: " + size);
        }
        size++;
    }

    if (size != expectedSize) {
        fail("Incorrect size: " + size + ", expected: " + expectedSize);
    }

    System.out.println("File: " + path + " has " + blocks + " blocks.");
    System.out.println("File: " + path + " has " + blockSize + " blockSize.");
    System.out.println("File: " + path + " has " + expectedSize + " length.");
    System.out.println("SUCCESS! The files match up!");
}
From source file:com.zjy.mongo.splitter.BSONSplitter.java
License:Apache License
public static long getSplitSize(final Configuration conf, final FileStatus file) {
    // Try new configuration options first, but fall back to old ones.
    long maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize",
            conf.getLong("mapred.max.split.size", Long.MAX_VALUE));
    long minSize = Math.max(1L, conf.getLong("mapreduce.input.fileinputformat.split.minsize",
            conf.getLong("mapred.min.split.size", 1L)));

    if (file != null) {
        long fileBlockSize = file.getBlockSize();
        return Math.max(minSize, Math.min(maxSize, fileBlockSize));
    } else {
        long blockSize = conf.getLong("dfs.blockSize", 64 * 1024 * 1024);
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }
}
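Note that this applies the same clamp as FileInputFormat.computeSplitSize shown earlier: with the default settings (minimum split size 1, maximum Long.MAX_VALUE), getSplitSize simply returns the file's block size, and the 64 MB fallback is used only when no FileStatus is available.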
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void fileStatusForFile() throws IOException {
    Path file = new Path("/dir/file"); // XXX new Path creates the file
    FileStatus stat = fs.getFileStatus(file);
    assertThat(stat.getPath().toUri().getPath(), is("/dir/file")); // XXX FileStatus.getPath().toUri() -> URI .getPath()
    assertThat(stat.isDir(), is(false)); // XXX assertThat(actual, Matcher) Matcher provides matches method
    assertThat(stat.getLen(), is(7L));
    assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
    assertThat(stat.getReplication(), is((short) 1));
    // XXX Matcher<Short> is(Short value) -> o.h.core.Is.<Short>is(Short) static factory method from o.h.core.Is
    // XXX which calls the constructor is(Matcher<Short> matcher) with the matcher equalTo(Short) hamcrest-all-1.3-source.java:Is.java:65
    assertThat(stat.getBlockSize(), is(64 * 1024 * 1024L));
    assertThat(stat.getOwner(), is(System.getProperty("user.name")));
    assertThat(stat.getGroup(), is("supergroup"));
    assertThat(stat.getPermission().toString(), is("rw-r--r--"));
}
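This test (which appears to be adapted from the MiniDFSCluster-based FileStatus example in Hadoop: The Definitive Guide) asserts the old 64 MB HDFS default block size; newer Hadoop releases default dfs.blocksize to 128 MB, so the expected value would differ there.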
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void fileStatusForDirectory() throws IOException {
    Path dir = new Path("/dir"); // XXX new Path creates the directory
    FileStatus stat = fs.getFileStatus(dir);
    assertThat(stat.getPath().toUri().getPath(), is("/dir"));
    assertThat(stat.isDir(), is(true));
    assertThat(stat.getLen(), is(0L));
    assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
    assertThat(stat.getReplication(), is((short) 0));
    assertThat(stat.getBlockSize(), is(0L));
    assertThat(stat.getOwner(), is(System.getProperty("user.name")));
    assertThat(stat.getGroup(), is("supergroup"));
    assertThat(stat.getPermission().toString(), is("rwxr-xr-x"));
}