List of usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
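All of the examples below follow the same pattern: look up a file's status, then ask the FileSystem which hosts hold each block in a given byte range, so that work can be scheduled close to the data. Before the examples, here is a minimal self-contained sketch of the call itself (the class name BlockLocationsExample is made up for illustration, the input path is a command-line placeholder, and a default Configuration pointing at your cluster or the local filesystem is assumed):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // placeholder, e.g. an HDFS file path
        FileSystem fs = path.getFileSystem(conf);
        long length = fs.getFileStatus(path).getLen();
        // Block locations covering the whole file: start = 0, len = file length
        BlockLocation[] blocks = fs.getFileBlockLocations(path, 0, length);
        for (BlockLocation block : blocks) {
            System.out.println("offset=" + block.getOffset()
                    + " length=" + block.getLength()
                    + " hosts=" + String.join(",", block.getHosts()));
        }
        fs.close();
    }
}

Note that most of the examples below pass a FileStatus rather than a Path: FileSystem offers both overloads, and the FileStatus form avoids the extra getFileStatus lookup that the Path form performs internally.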
From source file:com.sourcecode.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsedMillis());
    }
    return splits;
}
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/**
 * Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(
            job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
            minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining,
                        splitSize, clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining,
                        bytesRemaining, clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(makeSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
From source file:com.zjy.mongo.splitter.BSONSplitter.java
License:Apache License
public BSONFileSplit createFileSplit(final FileStatus inFile, final FileSystem fs,
        final long splitStart, final long splitLen) {
    BSONFileSplit split;
    try {
        BlockLocation[] blkLocations;
        // This code is based off of org.apache.hadoop.mapreduce.lib
        // .input.FileInputFormat.getSplits()
        if (inFile instanceof LocatedFileStatus) {
            blkLocations = ((LocatedFileStatus) inFile).getBlockLocations();
        } else {
            blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        }
        int blockIndex = getBlockIndex(blkLocations, splitStart);
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen,
                blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from byte offset."
                + " Using non-block-aware input split; " + e.getMessage());
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
    split.setKeyField(MongoConfigUtil.getInputKey(getConf()));
    return split;
}
From source file:edu.iu.common.MultiFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    // Random random = new Random(System.nanoTime());
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<Path>();
            hostSet = new HashSet<String>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}
From source file:edu.iu.fileformat.MultiFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    // randomizeFileListOrder(files);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<>();
            hostSet = new HashSet<>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}
From source file:edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory.java
License:Apache License
/**
 * Instead of creating the split using the input format, we do it manually.
 * This function returns fileSplits (1 per HDFS file block) irrespective of the number of partitions,
 * and the produced splits only cover the intersection between current files in HDFS and files stored
 * internally in AsterixDB.
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 *
 * @return
 * @throws IOException
 */
protected InputSplit[] getSplits(JobConf conf) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create file splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize()
                                    ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                // compare file names by value; the original used ==, which compares references
                if (files.get(i).getFileName().equals(file.getFileName())
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }
            // Get its information from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block intersects the delta -> create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L
                                : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut,
                                splitLength, block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files = orderedExternalFiles;
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}
From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java
License:Apache License
/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too
 * big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining,
                        splitSize, clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java
License:Apache License
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining,
                    splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        // Note: this branch is unreachable (length != 0 is already handled above); it appears
        // to be left over from FileInputFormat, where the first condition also checks isSplitable().
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/**
 * Spatially joins two files.
 * @param inputFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public static long distributedJoinSmart(final Path[] inputFiles, Path userOutputPath,
        OperationsParams params) throws IOException, InterruptedException {
    Path[] originalInputFiles = inputFiles.clone();
    FileSystem outFs = inputFiles[0].getFileSystem(params);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }

    // Decide whether to do a repartition step or not
    int cost_with_repartition, cost_without_repartition;
    final FileStatus[] fStatus = new FileStatus[inputFiles.length];
    for (int i_file = 0; i_file < inputFiles.length; i_file++) {
        // TODO work with folders. Calculate size more accurately
        FileSystem fs = inputFiles[i_file].getFileSystem(params);
        fStatus[i_file] = fs.getFileStatus(inputFiles[i_file]);
    }

    // Sort files by length (size)
    IndexedSortable filesBySize = new IndexedSortable() {
        @Override
        public void swap(int i, int j) {
            Path tmp1 = inputFiles[i];
            inputFiles[i] = inputFiles[j];
            inputFiles[j] = tmp1;
            FileStatus tmp2 = fStatus[i];
            fStatus[i] = fStatus[j];
            fStatus[j] = tmp2;
        }

        @Override
        public int compare(int i, int j) {
            // Return 0 only for equal lengths. The original snippet returned 0 when
            // fStatus[i] was shorter, which broke the ascending sort by size.
            if (fStatus[i].getLen() == fStatus[j].getLen())
                return 0;
            return fStatus[i].getLen() < fStatus[j].getLen() ? -1 : 1;
        }
    };
    new QuickSort().sort(filesBySize, 0, inputFiles.length);

    GlobalIndex<Partition>[] gIndexes = new GlobalIndex[fStatus.length];
    int[] numBlocks = new int[fStatus.length];
    for (int i_file = 0; i_file < fStatus.length; i_file++) {
        gIndexes[i_file] = SpatialSite.getGlobalIndex(outFs, fStatus[i_file].getPath());
        if (gIndexes[i_file] != null) {
            // Number of blocks is equal to number of partitions in global index
            numBlocks[i_file] = gIndexes[i_file].size();
        } else if (fStatus[i_file].isDir()) {
            // Add up number of file system blocks in all subfiles of this directory
            numBlocks[i_file] = 0;
            FileStatus[] subfiles = outFs.listStatus(inputFiles[i_file],
                    SpatialSite.NonHiddenFileFilter);
            for (FileStatus subfile : subfiles) {
                numBlocks[i_file] += outFs.getFileBlockLocations(subfile, 0, subfile.getLen()).length;
            }
        } else {
            // Number of file system blocks in input file
            numBlocks[i_file] = outFs.getFileBlockLocations(fStatus[i_file], 0,
                    fStatus[i_file].getLen()).length;
        }
    }

    cost_without_repartition = gIndexes[0] != null && gIndexes[1] != null
            ? GlobalIndex.spatialJoin(gIndexes[0], gIndexes[1], null)
            : (numBlocks[0] * numBlocks[1]);
    // Total cost = Cost of repartition (=== 2 * numBlocks[0]) +
    // cost of join (=== numBlocks[0] + numBlocks[1])
    cost_with_repartition = numBlocks[0] * 3 + numBlocks[1];
    LOG.info("Cost with repartition is estimated to " + cost_with_repartition);
    LOG.info("Cost without repartition is estimated to " + cost_without_repartition);
    boolean need_repartition = cost_with_repartition < cost_without_repartition;
    if (need_repartition) {
        int file_to_repartition = selectRepartition(inputFiles, params);
        repartitionStep(inputFiles, file_to_repartition, params);
    }

    // Restore inputFiles to the original order by user
    if (inputFiles[1] != originalInputFiles[1]) {
        Path temp = inputFiles[0];
        inputFiles[0] = inputFiles[1];
        inputFiles[1] = temp;
    }

    // Redistribute join the larger file and the partitioned file
    long result_size = DistributedJoin.joinStep(inputFiles, outputPath, params);

    if (userOutputPath == null)
        outFs.delete(outputPath, true);
    return result_size;
}
From source file:edu.umn.cs.spatialHadoop.ReadFile.java
License:Open Source License
public static void main(String[] args) throws Exception {
    OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getInt("offset", 0) == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}