List of usage examples for org.apache.hadoop.fs.FileStatus.getBlockSize()
public long getBlockSize()
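Before the project-specific examples below, here is a minimal, self-contained sketch of the basic call pattern. The path is a hypothetical placeholder and the FileSystem comes from whatever Configuration is on the classpath, so treat it as an illustration rather than a drop-in snippet.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Hypothetical path; replace with a file that exists on your file system.
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt"));
        // getBlockSize() reports the file's block size in bytes; directories conventionally report 0.
        long blockSize = status.getBlockSize();
        System.out.println("Block size: " + blockSize + " bytes");
    }
}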
From source file:org.apache.falcon.hadoop.JailedFileSystem.java
License:Apache License
@Override
public FileStatus getFileStatus(Path f) throws IOException {
    FileStatus status = localFS.getFileStatus(toLocalPath(f));
    if (status == null) {
        return null;
    }
    return new FileStatus(status.getLen(), status.isDirectory(), status.getReplication(),
            status.getBlockSize(), status.getModificationTime(), status.getAccessTime(),
            status.getPermission(), status.getOwner(), status.getGroup(),
            fromLocalPath(status.getPath()).makeQualified(this.getUri(), this.getWorkingDirectory()));
}
From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java
License:Apache License
/**
 * Common method for generating the list of vertex/edge input splits.
 *
 * @param job The job
 * @param files Array of FileStatus objects for vertex/edge input files
 * @return The list of vertex/edge input splits
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, List<FileStatus> files) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    return splits;
}
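The split size used above is derived from the file's block size via computeSplitSize. For reference, Hadoop's FileInputFormat conventionally clamps the block size between the configured minimum and maximum split sizes; a minimal sketch of that convention follows (an assumption about this particular computeSplitSize, not code taken from the Giraph source shown here). The same pattern appears in the Hama example further down.

// Sketch only: clamp blockSize into [minSize, maxSize], the usual Hadoop convention.
static long computeSplitSizeSketch(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}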
From source file:org.apache.gobblin.data.management.copy.splitter.DistcpFileSplitterTest.java
License:Apache License
private Collection<WorkUnit> createMockSplitWorkUnits(FileSystem fs, long fileLen, long blockSize,
        long maxSplitSize) throws Exception {
    FileStatus file = mock(FileStatus.class);
    when(file.getLen()).thenReturn(fileLen);
    when(file.getBlockSize()).thenReturn(blockSize);

    URI uri = new URI("hdfs", "dummyhost", "/test", "test");
    Path path = new Path(uri);
    when(fs.getUri()).thenReturn(uri);

    CopyableDatasetMetadata cdm = new CopyableDatasetMetadata(new TestCopyableDataset(path));

    CopyableFile cf = CopyableFileUtils.getTestCopyableFile();
    CopyableFile spy = spy(cf);
    doReturn(file).when(spy).getFileStatus();
    doReturn(blockSize).when(spy).getBlockSize(any(FileSystem.class));
    doReturn(path).when(spy).getDestination();

    WorkUnit wu = WorkUnit.createEmpty();
    wu.setProp(DistcpFileSplitter.MAX_SPLIT_SIZE_KEY, maxSplitSize);
    wu.setProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, 1, 0),
            path.toString());
    CopySource.setWorkUnitGuid(wu, Guid.fromStrings(wu.toString()));
    CopySource.serializeCopyEntity(wu, cf);
    CopySource.serializeCopyableDataset(wu, cdm);

    return DistcpFileSplitter.splitFile(spy, wu, fs);
}
From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java
License:Apache License
/**
 * Write the contents of input stream into staging path.
 *
 * <p>
 * WriteAt indicates the path where the contents of the input stream should be written. When this method is
 * called, the path writeAt.getParent() will already exist, but the path writeAt will not exist. When this
 * method returns, the path writeAt must exist. Any data written to any location other than writeAt or a
 * descendant of writeAt will be ignored.
 * </p>
 *
 * @param inputStream {@link FSDataInputStream} whose contents should be written to staging path.
 * @param writeAt {@link Path} at which contents should be written.
 * @param copyableFile {@link org.apache.gobblin.data.management.copy.CopyEntity} that generated this copy operation.
 * @param record The actual {@link FileAwareInputStream} passed to the write method.
 * @throws IOException
 */
protected void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile,
        FileAwareInputStream record) throws IOException {
    final short replication = this.state.getPropAsShort(ConfigurationKeys.WRITER_FILE_REPLICATION_FACTOR,
            copyableFile.getReplication(this.fs));
    final long blockSize = copyableFile.getBlockSize(this.fs);
    final long fileSize = copyableFile.getFileStatus().getLen();

    long expectedBytes = fileSize;
    Long maxBytes = null;
    // Whether writer must write EXACTLY maxBytes.
    boolean mustMatchMaxBytes = false;

    if (record.getSplit().isPresent()) {
        maxBytes = record.getSplit().get().getHighPosition() - record.getSplit().get().getLowPosition();
        if (record.getSplit().get().isLastSplit()) {
            expectedBytes = fileSize % blockSize;
            mustMatchMaxBytes = false;
        } else {
            expectedBytes = maxBytes;
            mustMatchMaxBytes = true;
        }
    }

    Predicate<FileStatus> fileStatusAttributesFilter = new Predicate<FileStatus>() {
        @Override
        public boolean apply(FileStatus input) {
            return input.getReplication() == replication && input.getBlockSize() == blockSize;
        }
    };
    Optional<FileStatus> persistedFile = this.recoveryHelper.findPersistedFile(this.state, copyableFile,
            fileStatusAttributesFilter);

    if (persistedFile.isPresent()) {
        log.info(String.format("Recovering persisted file %s to %s.", persistedFile.get().getPath(), writeAt));
        this.fs.rename(persistedFile.get().getPath(), writeAt);
    } else {
        // Copy empty directories
        if (copyableFile.getFileStatus().isDirectory()) {
            this.fs.mkdirs(writeAt);
            return;
        }

        OutputStream os = this.fs.create(writeAt, true, this.fs.getConf().getInt("io.file.buffer.size", 4096),
                replication, blockSize);
        if (encryptionConfig != null) {
            os = EncryptionFactory.buildStreamCryptoProvider(encryptionConfig).encodeOutputStream(os);
        }
        try {
            FileSystem defaultFS = FileSystem.get(new Configuration());
            StreamThrottler<GobblinScopeTypes> throttler = this.taskBroker
                    .getSharedResource(new StreamThrottler.Factory<GobblinScopeTypes>(), new EmptyKey());
            ThrottledInputStream throttledInputStream = throttler.throttleInputStream().inputStream(inputStream)
                    .sourceURI(copyableFile.getOrigin().getPath()
                            .makeQualified(defaultFS.getUri(), defaultFS.getWorkingDirectory()).toUri())
                    .targetURI(this.fs.makeQualified(writeAt).toUri()).build();
            StreamCopier copier = new StreamCopier(throttledInputStream, os, maxBytes)
                    .withBufferSize(this.bufferSize);

            log.info("File {}: Starting copy", copyableFile.getOrigin().getPath());

            if (isInstrumentationEnabled()) {
                copier.withCopySpeedMeter(this.copySpeedMeter);
            }
            long numBytes = copier.copy();
            if ((this.checkFileSize || mustMatchMaxBytes) && numBytes != expectedBytes) {
                throw new IOException(String.format("Incomplete write: expected %d, wrote %d bytes.",
                        expectedBytes, numBytes));
            }
            this.bytesWritten.addAndGet(numBytes);
            if (isInstrumentationEnabled()) {
                log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(),
                        this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate());
            } else {
                log.info("File {} copied.", copyableFile.getOrigin().getPath());
            }
        } catch (NotConfiguredException nce) {
            log.warn("Broker error. Some features of stream copier may not be available.", nce);
        } finally {
            os.close();
            inputStream.close();
        }
    }
}
From source file:org.apache.hama.bsp.FileInputFormat.java
License:Apache License
/**
 * Splits files returned by {@link #listStatus(BSPJob)} when they're too big. <br/>
 * numSplits will be ignored by the framework.
 */
@Override
public InputSplit[] getSplits(BSPJob job, int numSplits) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    /*
     * TODO: This does not consider data locality. When the numSplits
     * (user-defined) is equal to or smaller than the number of DFS splits, we
     * should assign multiple splits to a task.
     */

    // take the short circuit path if we have already partitioned
    // if (numSplits == files.length) {
    //   for (FileStatus file : files) {
    //     if (file != null) {
    //       splits.add(new FileSplit(file.getPath(), 0, file.getLen(),
    //           new String[0]));
    //     }
    //   }
    //   return splits.toArray(new FileSplit[splits.size()]);
    // }

    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong("bsp.input.files", files.length);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new InputSplit[splits.size()]);
}
From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer.java
License:Apache License
/**
 * Collects a number of basic statistics based on an estimate. Statistics
 * are: number of records, number of hdfs blocks and hdfs block size.
 *
 * @param datapath path is a data source URI that can appear as a file name,
 *            a directory name or a wildcard pattern
 * @return statistics in JSON format
 * @throws Exception if path is wrong, its metadata cannot be retrieved from
 *             file system, or if scanning the first block using the
 *             accessor failed
 */
@Override
public AnalyzerStats getEstimatedStats(String datapath) throws Exception {
    long blockSize = 0;
    long numberOfBlocks;
    long dataSize = 0;
    Path path = new Path(HdfsUtilities.absoluteDataPath(datapath));
    ArrayList<InputSplit> splits = getSplits(path);

    for (InputSplit split : splits) {
        FileSplit fsp = (FileSplit) split;
        dataSize += fsp.getLength();
        if (blockSize == 0) {
            Path filePath = fsp.getPath();
            FileStatus fileStatus = fs.getFileStatus(filePath);
            if (fileStatus.isFile()) {
                blockSize = fileStatus.getBlockSize();
            }
        }
    }

    // if no file is in path (only dirs), get default block size
    if (blockSize == 0) {
        blockSize = fs.getDefaultBlockSize(path);
    }
    numberOfBlocks = splits.size();

    /*
     * The estimate of the number of tuples in table is based on the
     * actual number of tuples in the first block, multiplied by its
     * size compared to the size of the whole data to be read.
     * The calculation:
     * Ratio of tuples to size = number of tuples in first block / first block size.
     * Total of tuples = ratio * number of blocks * total block size.
     */
    long numberOfTuplesInBlock = getNumberOfTuplesInBlock(splits);
    long numberOfTuples = 0;
    if (!splits.isEmpty()) {
        long blockLength = splits.get(0).getLength();
        numberOfTuples = (long) Math.floor((((double) numberOfTuplesInBlock / blockLength) * (dataSize)));
    }
    // AnalyzerStats stats = new AnalyzerStats(blockSize, numberOfBlocks,
    AnalyzerStats stats = new AnalyzerStats(blockSize, numberOfBlocks, numberOfTuples);

    // print files size to log when in debug level
    Log.debug(AnalyzerStats.dataToString(stats, path.toString()));

    return stats;
}
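A hypothetical worked example of the estimate above (the numbers are illustrative, not taken from the HAWQ source): if the first split holds 1,000 tuples in 128 MB and the total data size is 1,280 MB, the tuples-to-size ratio is 1,000 / (128 * 1024 * 1024), so the estimated total is that ratio times 1,280 MB, roughly 10,000 tuples.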
From source file:org.apache.hive.common.util.MockFileSystem.java
License:Apache License
public void touch(MockFile file) {
    if (fileStatusMap.containsKey(file)) {
        FileStatus fileStatus = fileStatusMap.get(file);
        FileStatus fileStatusNew = new FileStatus(fileStatus.getLen(), fileStatus.isDirectory(),
                fileStatus.getReplication(), fileStatus.getBlockSize(), fileStatus.getModificationTime() + 1,
                fileStatus.getAccessTime(), fileStatus.getPermission(), fileStatus.getOwner(),
                fileStatus.getGroup(), fileStatus.getPath());
        fileStatusMap.put(file, fileStatusNew);
    }
}
From source file:org.apache.ignite.hadoop.fs.IgniteHadoopIgfsSecondaryFileSystem.java
License:Apache License
/** {@inheritDoc} */
@Override
public Collection<IgfsFile> listFiles(IgfsPath path) {
    try {
        FileStatus[] statuses = fileSys.listStatus(convert(path));

        if (statuses == null)
            throw new IgfsPathNotFoundException("Failed to list files (path not found): " + path);

        Collection<IgfsFile> res = new ArrayList<>(statuses.length);

        for (FileStatus status : statuses) {
            IgfsFileInfo fsInfo = status.isDirectory()
                ? new IgfsFileInfo(true, properties(status))
                : new IgfsFileInfo((int) status.getBlockSize(), status.getLen(), null, null, false,
                    properties(status));

            res.add(new IgfsFileImpl(new IgfsPath(path, status.getPath().getName()), fsInfo, 1));
        }

        return res;
    } catch (FileNotFoundException ignored) {
        throw new IgfsPathNotFoundException("Failed to list files (path not found): " + path);
    } catch (IOException e) {
        throw handleSecondaryFsError(e,
            "Failed to list statuses due to secondary file system exception: " + path);
    }
}
From source file:org.apache.ignite.hadoop.fs.IgniteHadoopIgfsSecondaryFileSystem.java
License:Apache License
/** {@inheritDoc} */
@Override
public IgfsFile info(final IgfsPath path) {
    try {
        final FileStatus status = fileSys.getFileStatus(convert(path));

        if (status == null)
            return null;

        final Map<String, String> props = properties(status);

        return new IgfsFile() {
            @Override public IgfsPath path() {
                return path;
            }

            @Override public boolean isFile() {
                return status.isFile();
            }

            @Override public boolean isDirectory() {
                return status.isDirectory();
            }

            @Override public int blockSize() {
                // By convention directory has blockSize == 0, while file has blockSize > 0:
                return isDirectory() ? 0 : (int) status.getBlockSize();
            }

            @Override public long groupBlockSize() {
                return status.getBlockSize();
            }

            @Override public long accessTime() {
                return status.getAccessTime();
            }

            @Override public long modificationTime() {
                return status.getModificationTime();
            }

            @Override public String property(String name) throws IllegalArgumentException {
                String val = props.get(name);

                if (val == null)
                    throw new IllegalArgumentException(
                        "File property not found [path=" + path + ", name=" + name + ']');

                return val;
            }

            @Nullable @Override public String property(String name, @Nullable String dfltVal) {
                String val = props.get(name);

                return val == null ? dfltVal : val;
            }

            @Override public long length() {
                return status.getLen();
            }

            /** {@inheritDoc} */
            @Override public Map<String, String> properties() {
                return props;
            }
        };
    } catch (FileNotFoundException ignore) {
        return null;
    } catch (IOException e) {
        throw handleSecondaryFsError(e, "Failed to get file status [path=" + path + "]");
    }
}
From source file:org.apache.ignite.hadoop.fs.v1.IgniteHadoopFileSystem.java
License:Apache License
/**
 * Convert a file status obtained from the secondary file system to a status of the primary file system.
 *
 * @param status Secondary file system status.
 * @return Primary file system status.
 */
@SuppressWarnings("deprecation")
private FileStatus toPrimary(FileStatus status) {
    return status != null
        ? new FileStatus(status.getLen(), status.isDir(), status.getReplication(), status.getBlockSize(),
            status.getModificationTime(), status.getAccessTime(), status.getPermission(), status.getOwner(),
            status.getGroup(), toPrimary(status.getPath()))
        : null;
}