List of usage examples for org.apache.hadoop.fs.FileStatus.getBlockSize()
public long getBlockSize()
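Before the project examples below, here is a minimal, self-contained sketch of the typical pattern: obtain a FileStatus from a FileSystem and read its block size. The path /tmp/data.txt is hypothetical; substitute any existing file. The split estimate mirrors how the examples below use the value.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical path; use any file that exists on the target file system
        Path path = new Path("/tmp/data.txt");
        FileStatus status = fs.getFileStatus(path);

        // getBlockSize() returns the file's block size in bytes
        // (for HDFS this is commonly 128 MB = 134217728 by default)
        long blockSize = status.getBlockSize();

        // Common use: estimate how many block-sized splits the file spans
        long estimatedSplits = (long) Math.ceil(status.getLen() / (double) blockSize);
        System.out.println("block size = " + blockSize
                + ", estimated splits = " + estimatedSplits);
    }
}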
From source file: org.apache.tajo.storage.FileTablespace.java
License: Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @throws IOException
 */
public List<Fragment> getSplits(String tableName, TableMeta meta, Schema schema, Path... inputs)
    throws IOException {
  // generate splits
  List<Fragment> splits = Lists.newArrayList();
  List<Fragment> volumeSplits = Lists.newArrayList();
  List<BlockLocation> blockLocations = Lists.newArrayList();

  for (Path p : inputs) {
    ArrayList<FileStatus> files = Lists.newArrayList();
    if (fs.isFile(p)) {
      files.addAll(Lists.newArrayList(fs.getFileStatus(p)));
    } else {
      files.addAll(listStatus(p));
    }

    int previousSplitSize = splits.size();
    for (FileStatus file : files) {
      Path path = file.getPath();
      long length = file.getLen();
      if (length > 0) {
        // Get locations of blocks of file
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        boolean splittable = isSplittable(meta, schema, path, file);
        if (blocksMetadataEnabled && fs instanceof DistributedFileSystem) {
          if (splittable) {
            for (BlockLocation blockLocation : blkLocations) {
              volumeSplits.add(makeSplit(tableName, path, blockLocation));
            }
            blockLocations.addAll(Arrays.asList(blkLocations));
          } else { // Non splittable
            long blockSize = blkLocations[0].getLength();
            if (blockSize >= length) {
              blockLocations.addAll(Arrays.asList(blkLocations));
              for (BlockLocation blockLocation : blkLocations) {
                volumeSplits.add(makeSplit(tableName, path, blockLocation));
              }
            } else {
              splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
            }
          }
        } else {
          if (splittable) {
            long minSize = Math.max(getMinSplitSize(), 1);
            long blockSize = file.getBlockSize(); // s3n rest api contained block size but blockLocations is one
            long splitSize = Math.max(minSize, blockSize);
            long bytesRemaining = length;

            // for s3
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
              int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
              splits.add(makeSplit(tableName, path, length - bytesRemaining, splitSize,
                  blkLocations[blkIndex].getHosts()));
              bytesRemaining -= splitSize;
            }
            if (bytesRemaining > 0) {
              int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
              splits.add(makeSplit(tableName, path, length - bytesRemaining, bytesRemaining,
                  blkLocations[blkIndex].getHosts()));
            }
          } else { // Non splittable
            splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
          }
        }
      } else {
        // for zero length files
        splits.add(makeSplit(tableName, path, 0, length));
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("# of splits per partition: " + (splits.size() - previousSplitSize));
    }
  }

  // Combine original fileFragments with new VolumeId information
  setVolumeMeta(volumeSplits, blockLocations);
  splits.addAll(volumeSplits);
  LOG.info("Total # of splits: " + splits.size());
  return splits;
}
From source file: org.apache.tajo.storage.TestFileSystems.java
License: Apache License
@Test
public void testBlockSplit() throws IOException {
  Schema schema = new Schema();
  schema.addColumn("id", Type.INT4);
  schema.addColumn("age", Type.INT4);
  schema.addColumn("name", Type.TEXT);

  TableMeta meta = CatalogUtil.newTableMeta(StoreType.CSV);

  Tuple[] tuples = new Tuple[4];
  for (int i = 0; i < tuples.length; i++) {
    tuples[i] = new VTuple(3);
    tuples[i].put(new Datum[] { DatumFactory.createInt4(i), DatumFactory.createInt4(i + 32),
        DatumFactory.createText("name" + i) });
  }

  Path path = StorageUtil.concatPath(testDir, "testGetScannerAndAppender", "table.csv");
  fs.mkdirs(path.getParent());

  Appender appender = sm.getAppender(meta, schema, path);
  appender.init();
  for (Tuple t : tuples) {
    appender.addTuple(t);
  }
  appender.close();

  FileStatus fileStatus = fs.getFileStatus(path);
  List<Fragment> splits = sm.getSplits("table", meta, schema, path);
  int splitSize = (int) Math.ceil(fileStatus.getLen() / (double) fileStatus.getBlockSize());
  assertEquals(splitSize, splits.size());

  for (Fragment fragment : splits) {
    assertTrue(fragment.getLength() <= fileStatus.getBlockSize());
  }
}
From source file: org.cdlib.was.weari.pig.ArcListInputFormat.java
License: Apache License
/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
            blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }

      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
            blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      // Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }

  // Save the number of input files in the job-conf
  job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

  LOG.debug("Total # of splits: " + splits.size());
  return splits;
}
From source file: org.elasticsearch.hadoop.mr.NTFSLocalFileSystem.java
License: Apache License
@Override
public FileStatus getFileStatus(Path f) throws IOException {
  // it's the RawFS in place which messes things up as it dynamically returns the permissions...
  // workaround by doing a copy
  FileStatus fs = super.getFileStatus(f);

  // work-around for Hive 0.14
  if (SCRATCH_DIR.equals(f.toString())) {
    System.out.println("Faking scratch dir permissions on Windows...");

    return new FileStatus(fs.getLen(), fs.isDir(), fs.getReplication(), fs.getBlockSize(),
        fs.getModificationTime(), fs.getAccessTime(), SCRATCH_DIR_PERMS, fs.getOwner(), fs.getGroup(),
        fs.getPath());
    // this doesn't work since the RawFS impl has its own algo that does the lookup dynamically
    //fs.getPermission().fromShort((short) 777);
  }
  return fs;
}
From source file: org.exem.flamingo.agent.nn.hdfs.HdfsFileInfo.java
License: Apache License
public HdfsFileInfo(FileStatus fileStatus, ContentSummary contentSummary) {
  this.fullyQualifiedPath = fileStatus.getPath().toUri().getPath();
  this.filename = isEmpty(getFilename(fullyQualifiedPath)) ? getDirectoryName(fullyQualifiedPath)
      : getFilename(fullyQualifiedPath);
  this.length = fileStatus.isFile() ? fileStatus.getLen() : contentSummary.getLength();
  this.path = getPath(fullyQualifiedPath);
  this.directory = fileStatus.isDirectory();
  this.file = !fileStatus.isDirectory();
  this.owner = fileStatus.getOwner();
  this.group = fileStatus.getGroup();
  this.blockSize = fileStatus.getBlockSize();
  this.replication = fileStatus.getReplication();
  this.modificationTime = fileStatus.getModificationTime();
  if (contentSummary != null) {
    this.spaceConsumed = contentSummary.getSpaceConsumed();
    this.spaceQuota = contentSummary.getSpaceQuota();
    this.quota = contentSummary.getQuota();
    this.directoryCount = contentSummary.getDirectoryCount();
    this.fileCount = contentSummary.getFileCount();
  }
  this.accessTime = fileStatus.getAccessTime();
  this.permission = fileStatus.getPermission().toString();
}
From source file: org.gridgain.grid.kernal.ggfs.hadoop.GridGgfsHadoopFileSystemWrapper.java
License: Open Source License
/** {@inheritDoc} */
@Override
public Collection<GridGgfsFile> listFiles(GridGgfsPath path) throws GridException {
  try {
    FileStatus[] statuses = fileSys.listStatus(convert(path));

    if (statuses == null)
      throw new GridGgfsFileNotFoundException("Failed to list files (path not found): " + path);

    Collection<GridGgfsFile> res = new ArrayList<>(statuses.length);

    for (FileStatus status : statuses) {
      GridGgfsFileInfo fsInfo = status.isDirectory() ? new GridGgfsFileInfo(true, properties(status))
          : new GridGgfsFileInfo((int) status.getBlockSize(), status.getLen(), null, null, false,
              properties(status));

      res.add(new GridGgfsFileImpl(new GridGgfsPath(path, status.getPath().getName()), fsInfo, 1));
    }

    return res;
  } catch (FileNotFoundException ignored) {
    throw new GridGgfsFileNotFoundException("Failed to list files (path not found): " + path);
  } catch (IOException e) {
    throw handleSecondaryFsError(e, "Failed to list statuses due to secondary file system exception: " + path);
  }
}
From source file: org.gridgain.grid.kernal.ggfs.hadoop.GridGgfsHadoopFileSystemWrapper.java
License: Open Source License
/** {@inheritDoc} */
@Override
public GridGgfsFile info(final GridGgfsPath path) throws GridException {
  try {
    final FileStatus status = fileSys.getFileStatus(convert(path));

    if (status == null)
      return null;

    final Map<String, String> props = properties(status);

    return new GridGgfsFile() {
      @Override
      public GridGgfsPath path() {
        return path;
      }

      @Override
      public boolean isFile() {
        return status.isFile();
      }

      @Override
      public boolean isDirectory() {
        return status.isDirectory();
      }

      @Override
      public int blockSize() {
        return (int) status.getBlockSize();
      }

      @Override
      public long groupBlockSize() {
        return status.getBlockSize();
      }

      @Override
      public long accessTime() {
        return status.getAccessTime();
      }

      @Override
      public long modificationTime() {
        return status.getModificationTime();
      }

      @Override
      public String property(String name) throws IllegalArgumentException {
        String val = props.get(name);

        if (val == null)
          throw new IllegalArgumentException(
              "File property not found [path=" + path + ", name=" + name + ']');

        return val;
      }

      @Nullable
      @Override
      public String property(String name, @Nullable String dfltVal) {
        String val = props.get(name);

        return val == null ? dfltVal : val;
      }

      @Override
      public long length() {
        return status.getLen();
      }

      /** {@inheritDoc} */
      @Override
      public Map<String, String> properties() {
        return props;
      }
    };
  } catch (FileNotFoundException ignore) {
    return null;
  } catch (IOException e) {
    throw handleSecondaryFsError(e, "Failed to get file status [path=" + path + "]");
  }
}
From source file: org.hedera.io.input.WikiRevisionInputFormat.java
License: Apache License
/**
 * This code is copied from StreamWikiDumpNewInputFormat.java by Yusuke Matsubara.
 * Thanks to Tu Meteora for adjusting the code to the new mapreduce framework
 * @param jc the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext jc) throws IOException {
  List<FileStatus> files = listStatus(jc);
  List<FileStatus> remainingFiles = new ArrayList<>();

  List<InputSplit> splits = new ArrayList<InputSplit>();
  long totalSize = 0;

  // New features: Load splits from the index
  // Check the index before performing the split on the physical files
  Configuration conf = jc.getConfiguration();
  String mapFile = conf.get(SPLIT_MAPFILE_LOC);
  MapFile.Reader reader = null;
  Text key = null;
  RevisionSplits val = new RevisionSplits();
  try {
    if (mapFile != null) {
      reader = new MapFile.Reader(new Path(mapFile + "/part-r-00000"), conf);
      key = new Text();
    }

    // check we have valid files
    for (FileStatus file : files) {
      if (file.isDirectory()) {
        throw new IOException("Not a file: " + file.getPath());
      }

      // if found in the index, load the splits into main memory, otherwise
      // add to remainings for next processing
      if (reader != null) {
        key.set(file.getPath().toString());
        if (reader.seek(key)) {
          reader.get(key, val);
          FileSplit[] spl = val.splits();
          for (FileSplit sp : spl)
            splits.add(sp);
          continue;
        }
      }
      remainingFiles.add(file);
      totalSize += file.getLen();
    }

    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(jc));

    // 2014-06-06: Tuan _ I have to manually increase the file split size
    // here to cope with Wikipedia Revision .bz2 file - the decompressor
    // takes too long to run
    long goalSize = totalSize / 3;

    for (FileStatus file : remainingFiles) {
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(goalSize, minSize, blockSize);

      for (InputSplit x : getSplits(jc, file, splitSize))
        splits.add(x);
    }
  } finally {
    if (reader != null)
      reader.close();
  }

  return splits;
}
From source file: org.imageterrier.hadoop.mapreduce.PositionAwareSequenceFileInputFormat.java
License: Mozilla Public License
/**
 * Generate the list of files and make them into FileSplits.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  int splitnum = 0;

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  for (FileStatus file : listStatus(job)) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, length - bytesRemaining,
            splitSize, blkLocations[blkIndex].getHosts()), splitnum++));
        bytesRemaining -= splitSize;
      }

      if (bytesRemaining != 0) {
        splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, length - bytesRemaining,
            bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()), splitnum++));
      }
    } else if (length != 0) {
      splits.add(new PositionAwareSplitWrapper<FileSplit>(
          new FileSplit(path, 0, length, blkLocations[0].getHosts()), splitnum++));
    } else {
      // Create empty hosts array for zero length files
      splits.add(new PositionAwareSplitWrapper<FileSplit>(new FileSplit(path, 0, length, new String[0]),
          splitnum++));
    }
  }

  LOG.debug("Total # of splits: " + splits.size());
  return splits;
}
From source file: org.mrgeo.cmd.mrsimageinfo.MrsImageInfo.java
License: Apache License
private void printFileInfo(final Path pfile, PrintStream out) throws IOException {
  // TODO: The following is HDFS-specific; needs to be re-factored
  final FileSystem fs = pfile.getFileSystem(config);
  final FileStatus stat = fs.getFileStatus(pfile);

  out.print(" date: " + DateTimeFormat.shortDateTime().print(stat.getModificationTime()));
  out.println(" size: " + human(stat.getLen()));

  final FsPermission p = stat.getPermission();

  if (debug) {
    out.print(" ");
    out.print(stat.isDir() ? "d" : "f");
    out.print(" u: " + stat.getOwner() + " (" + p.getUserAction().toString().toLowerCase() + ")");
    out.print(" g: " + stat.getGroup() + " (" + p.getGroupAction().toString().toLowerCase() + ")");
    out.print(" o: " + "(" + p.getOtherAction().toString().toLowerCase() + ")");
    out.print(" blk: " + human(stat.getBlockSize()));
    out.println(" repl: " + stat.getReplication());
  }
}