List of usage examples for org.apache.hadoop.fs.FileStatus.getBlockSize()
public long getBlockSize()
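getBlockSize() returns the block size of the file, in bytes. Before the project-specific examples below, here is a minimal sketch of calling it directly; the class name and the path "/tmp/example.txt" are placeholders for illustration, not taken from the sources below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetBlockSizeExample {
    public static void main(String[] args) throws Exception {
        // The default Configuration picks up fs.defaultFS from the classpath.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // "/tmp/example.txt" is a placeholder; substitute a file that exists.
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt"));
        // The block size is reported in bytes (e.g. 134217728 for a 128 MB HDFS block).
        System.out.println("Block size: " + status.getBlockSize() + " bytes");
        fs.close();
    }
}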
From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java
License:Apache License
/**
 * Computes the actual split size. The computed split size will be no larger than the given
 * max split size and no smaller than the given min split size, except when the number of
 * bytes between the offset and the file length is smaller than the min split size.
 *
 * @param fileStatus The FileStatus of the file to split on.
 * @param offset Starting offset for the split.
 * @param minSplitSize Minimum size for the split.
 * @param maxSplitSize Maximum size for the split.
 * @return The computed split size.
 */
private long computeSplitSize(FileStatus fileStatus, long offset, long minSplitSize, long maxSplitSize) {
    long blockSize = fileStatus.getBlockSize();
    long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
    return Math.min(splitSize, fileStatus.getLen() - offset);
}
From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java
License:Apache License
/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");
    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);

    // compute total size
    long totalSize = 0;
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job The current BSPJob job
 * @return input splits
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = 0L;
            if (job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1) == 1) {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize;
                } else {
                    splitSize = job.getSplitSize();
                }
            } else {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                } else {
                    splitSize = job.getSplitSize() * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                }
            }
            LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.info("[Split Number] " + splits.size());
    return splits;
}
From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java
License:Open Source License
private void testCreate(Path path, boolean override) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    FileSystem fs = FileSystem.get(getJettyURL().toURI(), conf);
    FsPermission permission = new FsPermission(FsAction.READ_WRITE, FsAction.NONE, FsAction.NONE);
    OutputStream os = fs.create(new Path(path.toUri().getPath()), permission, override, 1024, (short) 2,
            100 * 1024 * 1024, null);
    os.write(1);
    os.close();
    fs.close();

    fs = FileSystem.get(getHadoopConf());
    FileStatus status = fs.getFileStatus(path);
    Assert.assertEquals(status.getReplication(), 2);
    Assert.assertEquals(status.getBlockSize(), 100 * 1024 * 1024);
    Assert.assertEquals(status.getPermission(), permission);

    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}
From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java
License:Open Source License
private void testListStatus() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    FileStatus status1 = fs.getFileStatus(path);
    fs.close();

    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    FileStatus status2 = fs.getFileStatus(new Path(path.toUri().getPath()));
    fs.close();

    Assert.assertEquals(status2.getPermission(), status1.getPermission());
    Assert.assertEquals(status2.getPath().toUri().getPath(), status1.getPath().toUri().getPath());
    Assert.assertEquals(status2.getReplication(), status1.getReplication());
    Assert.assertEquals(status2.getBlockSize(), status1.getBlockSize());
    Assert.assertEquals(status2.getAccessTime(), status1.getAccessTime());
    Assert.assertEquals(status2.getModificationTime(), status1.getModificationTime());
    Assert.assertEquals(status2.getOwner(), status1.getOwner());
    Assert.assertEquals(status2.getGroup(), status1.getGroup());
    Assert.assertEquals(status2.getLen(), status1.getLen());

    FileStatus[] stati = fs.listStatus(path.getParent());
    Assert.assertEquals(stati.length, 1);
    Assert.assertEquals(stati[0].getPath().getName(), path.getName());
}
From source file:com.cloudera.hoop.fs.FSUtils.java
License:Open Source License
/**
 * Converts a Hadoop <code>FileStatus</code> object into a JSON array
 * object. It replaces the <code>SCHEME://HOST:PORT</code> of the path
 * with the specified URL.
 * <p/>
 * @param status Hadoop file status.
 * @param hoopBaseUrl base URL to replace the <code>SCHEME://HOST:PORT</code>
 * in the file status.
 * @return The JSON representation of the file status.
 */
@SuppressWarnings("unchecked")
public static Map fileStatusToJSON(FileStatus status, String hoopBaseUrl) {
    Map json = new LinkedHashMap();
    json.put("path", convertPathToHoop(status.getPath(), hoopBaseUrl).toString());
    json.put("isDir", status.isDir());
    json.put("len", status.getLen());
    json.put("owner", status.getOwner());
    json.put("group", status.getGroup());
    json.put("permission", permissionToString(status.getPermission()));
    json.put("accessTime", status.getAccessTime());
    json.put("modificationTime", status.getModificationTime());
    json.put("blockSize", status.getBlockSize());
    json.put("replication", status.getReplication());
    return json;
}
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/**
 * For filesystems that don't override getFileBlockLocations, synthesize file blocks by
 * manually splitting the file range into fixed-size blocks. That way, scan ranges can
 * be derived from file blocks as usual. All synthesized blocks are given an invalid
 * network address so that the scheduler will treat them as remote.
 *
 * Must be threadsafe. Access to 'hostIndex' must be protected.
 */
private static void synthesizeBlockMetadata(FileStatus file, FileDescriptor fd, HdfsFileFormat fileFormat,
        ListMap<TNetworkAddress> hostIndex) {
    long start = 0;
    long remaining = fd.getFileLength();
    long blockSize = file.getBlockSize();
    if (blockSize < MIN_SYNTHETIC_BLOCK_SIZE)
        blockSize = MIN_SYNTHETIC_BLOCK_SIZE;
    if (!fileFormat.isSplittable(HdfsCompression.fromFileName(fd.getFileName()))) {
        blockSize = remaining;
    }
    while (remaining > 0) {
        long len = Math.min(remaining, blockSize);
        int idx = -1;
        synchronized (hostIndex) {
            idx = hostIndex.getIndex(REMOTE_NETWORK_ADDRESS);
        }
        List<BlockReplica> replicas = Lists.newArrayList(new BlockReplica(idx, false));
        fd.addFileBlock(new FileBlock(start, len, replicas));
        remaining -= len;
        start += len;
    }
}
From source file:com.conductor.s3.S3InputFormatUtils.java
License:Apache License
/**
 * Converts the {@link org.apache.hadoop.fs.FileStatus}s to
 * {@link org.apache.hadoop.mapred.InputSplit}s (MRV1 API).
 * <p>
 * This is taken directly from {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat},
 * less any file system operations that do not make sense when using {@code S3}.
 *
 * @param files the files to convert
 * @param minSize the minimum size of the splits
 * @param maxSize the maximum size of the splits
 * @return the splits.
 */
static List<InputSplit> convertToInputSplitsMRV1(final Iterable<FileStatus> files, final long minSize,
        final long maxSize) {
    final List<InputSplit> splits = Lists.newArrayList();
    for (final FileStatus file : files) {
        // check for valid data for this input format
        checkArgument(!file.isDirectory(), "Cannot pass directories to this method!");
        final String path = file.getPath().toString();
        checkArgument(path.startsWith("s3:") || path.startsWith("s3n:"), "Expected S3 input");

        // create splits out of file
        final long length = file.getLen();
        if (length > 0) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, splitSize, S3_SPLIT_HOST));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, bytesRemaining,
                        S3_SPLIT_HOST));
            }
        }
    }
    return splits;
}
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file:com.edwardsit.spark4n6.EWFImageInputFormat.java
License:Apache License
protected long getChunksPerSplit(FileStatus file) {
    return file.getBlockSize() / chunkSize;
}