List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
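Before the project examples, a minimal self-contained sketch of a typical call (the path /user/example/data.txt, the default Configuration, and the class name BlockLocationExample are illustrative assumptions, not taken from any of the sources below):

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws IOException {
        // Illustrative path; substitute a file that exists on your cluster
        Path p = new Path("/user/example/data.txt");
        FileSystem fs = p.getFileSystem(new Configuration());
        FileStatus status = fs.getFileStatus(p);
        // Ask for the locations of every block that overlaps [0, file length)
        BlockLocation[] locations = fs.getFileBlockLocations(p, 0, status.getLen());
        for (BlockLocation location : locations) {
            System.out.println("offset=" + location.getOffset()
                    + " length=" + location.getLength()
                    + " hosts=" + Arrays.toString(location.getHosts()));
        }
    }
}

Passing status.getLen() as the length requests locations for every block in the file; a narrower start/len pair returns only the blocks overlapping that range. Most of the examples below follow this whole-file pattern.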
From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into
            // uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HoplogUtil.java
License:Apache License
/**
 * Creates a mapping of hoplog to hdfs blocks on disk
 *
 * @param files
 *          list of hoplog file status objects
 * @return array of hdfs block location objects associated with a hoplog
 * @throws IOException
 */
public static Map<FileStatus, BlockLocation[]> getBlocks(Configuration config,
        Collection<FileStatus> files) throws IOException {
    Map<FileStatus, BlockLocation[]> blocks = new HashMap<FileStatus, BlockLocation[]>();
    if (files == null || files.isEmpty()) {
        return blocks;
    }

    FileSystem fs = files.iterator().next().getPath().getFileSystem(config);
    for (FileStatus hoplog : files) {
        long length = hoplog.getLen();
        BlockLocation[] fileBlocks = fs.getFileBlockLocations(hoplog, 0, length);
        blocks.put(hoplog, fileBlocks);
    }
    return blocks;
}
From source file:com.google.mr4c.sources.URIDataFileSource.java
License:Open Source License
@Override
public BlockLocation[] getBlockLocation() throws IOException {
    URI uri = ContentFactories.scrubURI(m_uri);
    FileSystem fs = FileSystem.get(uri, s_config);
    Path path = new Path(uri);
    FileStatus status = fs.getFileStatus(path);
    // Note: this passes the block size rather than the file length, so it only
    // returns locations for blocks overlapping the first blockSize bytes of the file
    return fs.getFileBlockLocations(status, 0, status.getBlockSize());
}
From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java
License:Apache License
protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java
License:Open Source License
/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job,
            Reporter.NULL);
    reader.next(key, value);
    reader.close();

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first iteration
        //as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }

        /*
        int lFrom = t.getIterations().get(0).getIntValue();
        int lTo = t.getIterations().get(1).getIntValue();
        int lIncr = t.getIterations().get(2).getIntValue();
        for (int i = lFrom; i <= lTo; i += lIncr) {
            String fname = _fname + "/" + String.valueOf(((i - _offset) / _blen + _offset));
            FileSystem fs = FileSystem.get(job);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
        */
    }

    //System.out.println("Get locations "+time.stop()+"");

    //majority consensus on top host
    return getTopHosts(hosts);
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java
License:Apache License
protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    if (stat instanceof LocatedFileStatus) {
        return ((LocatedFileStatus) stat).getBlockLocations();
    }
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    return splits;
}
From source file:com.linkedin.cubert.io.rubix.RubixInputSplit.java
License:Open Source License
@Override
public String[] getLocations() throws IOException, InterruptedException {
    if (hostnames == null) {
        /* Obtain the FileSystem object and get the FileStatus objects for the split */
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(filename);
        /*
         * Obtain the Block locations for the split. This also provides the offset and
         * length information for each block
         */
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, offset,
                length);

        /*
         * Collect all hosts in a map and populate the number of bytes to be read from
         * each host
         */
        Long l;
        Map<String, Long> hostMap = new HashMap<String, Long>();
        for (BlockLocation bl : blockLocations) {
            final long start = bl.getOffset() < offset ? offset : bl.getOffset();
            final long end = (offset + length) < (bl.getOffset() + bl.getLength()) ? offset + length
                    : bl.getOffset() + bl.getLength();
            final long nRelevantBytes = end - start;
            for (String host : bl.getHosts()) {
                hostMap.put(host, ((l = hostMap.get(host)) == null ? 0 : l) + nRelevantBytes);
            }
        }

        /* Sort them in decreasing order of maximum number of relevant bytes */
        final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
        final Map.Entry<String, Long>[] hostLengthPairs = entries.toArray(new Map.Entry[entries.size()]);
        Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
                return (int) (e2.getValue() - e1.getValue());
            }
        });

        /* Populate the hostnames object */
        final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
        hostnames = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            hostnames[i] = hostLengthPairs[i].getKey();
        }
    }
    return hostnames;
}
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);

        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for dir according to FSDirectory.java in 0.20
            // however, w/ Hadoop2, dir in local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || fs instanceof DistributedFileSystem == false) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString());
            }

            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }

    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    // take a second pass of the splits generated to extract files from
    // directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}
From source file:com.marklogic.mapreduce.ForestInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus children[] = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null,
                timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found. The forest is either live or isn't "
                    + "dismounted cleanly. Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) { // unexpected, give up this stand
            LOG.warn("Found empty TreeData file. Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = "
                            + position + ", offset = " + offset + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position
                            + ", docid = " + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L
                        && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position
                            + ", offset = " + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart,
                            prevOffset - splitStart, blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len="
                                + (prevOffset - splitStart) + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }
            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }
    return splits;
}