List of usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
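For orientation before the real-world examples below, here is a minimal, self-contained sketch of calling this method directly. The default filesystem URI is taken from the classpath configuration, and the path /tmp/example.txt is a placeholder for an existing file. Note that FileSystem also provides a FileStatus-based overload, getFileBlockLocations(FileStatus file, long start, long len), which most of the examples below use because they already hold a FileStatus.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // picks up core-site.xml / hdfs-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt"); // placeholder path; substitute a real file
        FileStatus status = fs.getFileStatus(p);

        // Path-based overload: ask for the locations of every block in the file
        BlockLocation[] locations = fs.getFileBlockLocations(p, 0, status.getLen());

        // Equivalent FileStatus-based overload, as used in most examples below:
        // BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());

        for (BlockLocation bl : locations) {
            System.out.println("offset=" + bl.getOffset() + " length=" + bl.getLength()
                    + " hosts=" + Arrays.toString(bl.getHosts()));
        }
        fs.close();
    }
}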
From source file: org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java
License: Mozilla Public License
/** {@inheritDoc} */
@SuppressWarnings("unchecked")
/** Make the splits of the index structure. Bit structures split across multiple files are supported */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    HadoopUtility.loadTerrierJob(job);
    final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
    final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
    Index.setIndexLoadingProfileAsRetrieval(false);
    final IndexOnDisk index = HadoopUtility.fromHConfiguration(job);

    final byte fileCount = Byte
            .parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
    final Path bitPostingStructureFiles[] = new Path[fileCount];
    final FileStatus[] fss = new FileStatus[fileCount];
    final long[] bitPostingStructureFSBlockSizes = new long[fileCount];

    logger.info("Calculating splits of structure " + bitPostingStructureName);
    FileSystem fs = FileSystem.get(job);
    for (byte i = 0; i < fileCount; i++) {
        bitPostingStructureFiles[i] = new Path(
                BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
        fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
        bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
        logger.info("File " + i + " approx splits="
                + ((double) fss[i].getLen() / (double) bitPostingStructureFSBlockSizes[i]));
    }

    // this smells of a hack, because we don't have a strategy for naming the streams of various index structures
    final Iterator<? extends BitIndexPointer> offsetIterator = index
            .hasIndexStructureInputStream(lookupStructureName + "-entry")
                    ? (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName + "-entry")
                    : (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName);
    if (offsetIterator == null)
        throw new IOException("No such stream structure called " + lookupStructureName + "-entry or "
                + lookupStructureName + " found in index");

    final List<InputSplit> splitList = new ArrayList<InputSplit>();
    int currentId = 0;
    // size of the current split of each file
    final long[] blockSize = new long[fileCount];
    // location of the last split for each file
    final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];
    // how many entries will be in this split, for each file
    final int[] entriesInBlock = new int[fileCount];
    // what is the starting id of the next entry split, for each file
    final int[] firstEntryOfNextSplit = new int[fileCount];
    // number of splits per file, for logging only
    final int[] splitsPerFile = new int[fileCount];
    Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);

    BitIndexPointer currentPointer = null;
    // iterate through the lookup iterator;
    // split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
    while (offsetIterator.hasNext()) {
        // ok, where does the next pointer point to?
        currentPointer = offsetIterator.next();
        final byte fileId = currentPointer.getFileNumber();
        // what is the first entry of the next split of this file?
        firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
        // this split will have one more entry
        entriesInBlock[fileId]++;
        // what is our current offset?
        long offset = currentPointer.getOffset();
        // if we made the split here, how big would it be?
        blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
        // is this block large enough?
        if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId]) {
            // yes, it's big enough:
            // block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                    bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
            splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], // path
                    bitPostingStructureSplitEndOffsets[fileId], // start
                    blockSize[fileId], // length
                    blkLocations[0].getHosts(), // hosts
                    firstEntryOfNextSplit[fileId], // first entry in this split
                    entriesInBlock[fileId]) // number of entries in this split
            );
            logger.info("File " + fileId + " split " + (splitList.size() - 1) + " "
                    + splitList.get(splitList.size() - 1).toString());
            // record another split for this file (for logging only)
            splitsPerFile[fileId]++;
            // update recording of last offset for this file
            bitPostingStructureSplitEndOffsets[fileId] = offset;
            // reset size of split for this file
            blockSize[fileId] = 0;
            // reset counter of entries in split of this file
            entriesInBlock[fileId] = 0;
            // reset the first offset of this split
            firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
        }
        // ids always increment
        currentId++;
    }
    IndexUtil.close(offsetIterator);

    // find any files which have trailing blocks
    for (byte fileId = 0; fileId < fileCount; fileId++) {
        if (entriesInBlock[fileId] == 0)
            continue;
        assert (firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);
        // block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
        BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
        splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], // path of file for split
                bitPostingStructureSplitEndOffsets[fileId], // start offset of this split
                blockSize[fileId], // size of this split
                blkLocations[0].getHosts(), // hosts for this split
                firstEntryOfNextSplit[fileId], // first entry id for this split
                entriesInBlock[fileId]) // number of entries in this split
        );
        logger.info("File " + fileId + " trailing split " + (splitList.size() - 1) + " "
                + splitList.get(splitList.size() - 1).toString());
        // record another split for this file (for logging only)
        splitsPerFile[fileId]++;
    }

    logger.info("Split " + bitPostingStructureName + " (of " + currentId + " entries) into " + splitList.size()
            + " splits");
    if (fileCount > 1) {
        logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: "
                + ArrayUtils.join(splitsPerFile, ","));
    }
    assert (splitList.size() > 0);
    index.close();
    return splitList.toArray(new InputSplit[splitList.size()]);
}
From source file: org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java
License: Mozilla Public License
@SuppressWarnings("unchecked") @Override/*from w w w .j a va2 s . co m*/ /** * Splits the input collection into * sets of files where each Map task * gets about the same number of files */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { Path[] paths = FileInputFormat.getInputPaths(job); // HADOOP-1818: Manage splits only if there are paths if (paths.length == 0) { return new InputSplit[0]; } if (numSplits > paths.length) { numSplits = paths.length; } else if (numSplits < 1) { numSplits = 1; } logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks"); List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>( numSplits); final int numPaths = paths.length; long[] lengths = new long[numPaths]; TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array .newInstance(TObjectLongHashMap.class, numPaths); final FileSystem fs = FileSystem.get(job); for (int i = 0; i < paths.length; i++) { final FileStatus fss = fs.getFileStatus(paths[i]); lengths[i] = fss.getLen(); final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>(); final long normalblocksize = fss.getBlockSize(); for (long offset = 0; offset < lengths[i]; offset += normalblocksize) { final long blocksize = Math.min(offset + normalblocksize, lengths[i]); final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize); for (BlockLocation bl : blockLocations) { for (String host : bl.getHosts()) { location2size.adjustOrPutValue(host, blocksize, blocksize); } } } } //we need to over-estimate using ceil, to ensure that the last split is not /too/ big final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits); int pathsUsed = 0; int splitnum = 0; CombineFileSplit mfs; // for each split except the last one (which may be smaller than numberOfFilesPerSplit) while (pathsUsed < numPaths) { /* caclulate split size for this task - usually numberOfFilesPerSplit, but * less than this for the last split */ final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed : numberOfFilesPerSplit; //arrays of information for split Path[] splitPaths = new Path[splitSizeForThisSplit]; long[] splitLengths = new long[splitSizeForThisSplit]; long[] splitStarts = new long[splitSizeForThisSplit]; final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>(); String[] splitLocations = null; //final recommended locations for this split. 
for (int i = 0; i < splitSizeForThisSplit; i++) { locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() { public boolean execute(String a, long b) { allLocationsForSplit.adjustOrPutValue(a, b, b); return true; } }); if (allLocationsForSplit.size() <= 3) { splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); } else { String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); Arrays.sort(hosts, new Comparator<String>() { public int compare(String o1, String o2) { long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2); if (diffamount > 0) { return -1; } else if (diffamount < 0) { return 1; } return 0; } }); splitLocations = new String[3]; System.arraycopy(hosts, 0, splitLocations, 0, 3); } } //copy information for this split System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit); System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit); //count the number of paths consumed pathsUsed += splitSizeForThisSplit; //make the actual split object //logger.info("New split of size " + splitSizeForThisSplit); mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations); splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum)); splitnum++; } if (!(pathsUsed == paths.length)) { throw new IOException("Number of used paths does not equal total available paths!"); } return splits.toArray(new PositionAwareSplit[splits.size()]); }
From source file: org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {
        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(path + ": #" + splits.size() + " " + pageEndPattern + " found: pos="
                    + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read="
                    + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining
                    + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}
From source file: parquet.hadoop.ParquetInputFormat.java
License: Apache License
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 */
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers)
        throws IOException {
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long minSplitSize = Math.max(getFormatMinSplitSize(),
            configuration.getLong("mapred.min.split.size", 0L));
    if (maxSplitSize < 0 || minSplitSize < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
                + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers,
            configuration.getBoolean(STRICT_TYPE_CHECKING, true));
    ReadContext readContext = getReadSupport(configuration).init(
            new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
    for (Footer footer : footers) {
        final Path file = footer.getFile();
        LOG.debug(file);
        FileSystem fs = file.getFileSystem(configuration);
        FileStatus fileStatus = fs.getFileStatus(file);
        ParquetMetadata parquetMetaData = footer.getParquetMetadata();
        List<BlockMetaData> blocks = parquetMetaData.getBlocks();
        // ask for block locations over the whole file; splits are then generated per row group
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        splits.addAll(generateSplits(blocks, fileBlockLocations, fileStatus, parquetMetaData.getFileMetaData(),
                readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(), minSplitSize,
                maxSplitSize));
    }
    return splits;
}
From source file: StorageEngineClient.ColumnStorageInputFormat.java
License: Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    // keep only the largest "_idx" file per key
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }
    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        tmpPath = keypath;
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length == 0) {
            continue;
        }
        if (blkLocations.length == 1) {
            // single-block file: one split covering the whole file
            FormatStorageInputSplit split = new FormatStorageInputSplit(keypath, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            // multi-block file: one split per segment of the index file
            // (note that segment i is assumed here to lie in block i)
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                FormatStorageInputSplit split = new FormatStorageInputSplit(keypath,
                        segmentIndex.getseglen(i), segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline()
                                + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
    }
    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
        return splits.toArray(new FormatStorageInputSplit[splits.size()]);
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}
From source file: StorageEngineClient.CombineFileInputFormat.java
License: Open Source License
private void getMoreSplitsWithStatus(JobConf job, Path[] paths1, Map<String, FileStatus> fileNameToStatus,
        long maxSize, long minSizeNode, long minSizeRack, List<CombineFileSplit> splits)
        throws IOException, NullGzFileException {
    if (paths1.length == 0) {
        return;
    }
    Path[] paths = paths1;
    // separate the splitable files from the unsplitable ones (e.g. compressed files)
    ArrayList<Path> splitable = new ArrayList<Path>();
    ArrayList<Path> unsplitable = new ArrayList<Path>();
    for (int i = 0; i < paths1.length; i++) {
        if (isSplitable(paths1[i].getFileSystem(job), paths1[i])) {
            splitable.add(paths1[i]);
        } else {
            unsplitable.add(paths1[i]);
        }
    }
    if (unsplitable.size() != 0) {
        paths = new Path[splitable.size()];
        splitable.toArray(paths);
    }

    OneFileInfo[] files;
    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    // mapping from a node to the list of blocks it holds
    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();
    files = new OneFileInfo[paths.length];

    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], fileNameToStatus.get(paths[i].toString()), job, rackToBlocks,
                blockToNodes, nodeToBlocks);
        totLength += files[i].getLength();
    }
    // pack node-local blocks first, then rack-local ones, then whatever remains
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> onenode = iter.next();
        this.processsplit(job, onenode, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "node");
    }
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
            .hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> onerack = iter.next();
        this.processsplit(job, onerack, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "rack");
    }
    this.processsplit(job, null, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");

    int maxFileNumPerSplit = job.getInt("hive.merge.inputfiles.maxFileNumPerSplit", 1000);
    HashSet<OneBlockInfo> hs = new HashSet<OneBlockInfo>();
    // group any remaining blocks into splits of at most maxFileNumPerSplit files
    while (blockToNodes.size() > 0) {
        ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
        List<String> nodes = new ArrayList<String>();
        int filenum = 0;
        hs.clear();
        for (OneBlockInfo blockInfo : blockToNodes.keySet()) {
            validBlocks.add(blockInfo);
            filenum++;
            for (String host : blockInfo.hosts) {
                nodes.add(host);
            }
            hs.add(blockInfo);
            if (filenum >= maxFileNumPerSplit) {
                break;
            }
        }
        for (OneBlockInfo blockInfo : hs) {
            blockToNodes.remove(blockInfo);
        }
        this.addCreatedSplit(job, splits, nodes, validBlocks);
    }

    if (unsplitable.size() != 0) {
        // each unsplitable file becomes a single block, located on the hosts of its first block
        HashMap<OneBlockInfo, String[]> fileToNodes = new HashMap<OneBlockInfo, String[]>();
        for (Path path : unsplitable) {
            FileSystem fs = path.getFileSystem(job);
            FileStatus stat = fileNameToStatus.get(path.toString());
            long len = stat.getLen();
            BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, len);
            if (locations.length == 0) {
                console.printError("The file " + path.toUri().toString() + " may be empty, please check it!");
                throw new NullGzFileException(
                        "The file " + path.toUri().toString() + " may be empty, please check it!");
            }
            LOG.info("unsplitable file:" + path.toUri().toString() + " length:" + len);
            OneBlockInfo oneblock = new OneBlockInfo(path, 0, len, locations[0].getHosts(),
                    locations[0].getTopologyPaths());
            fileToNodes.put(oneblock, locations[0].getHosts());
        }
        this.processsplitForUnsplit(job, null, fileToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");
    }
}
From source file: StorageEngineClient.FormatStorageInputFormat.java
License: Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        tmpPath = path;
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length == 0) {
            continue;
        } else if (blkLocations.length == 1) {
            // single-block file: one split covering the whole file
            FormatStorageInputSplit split = new FormatStorageInputSplit(path, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            // multi-block file: one split per segment, with the block locations
            // looked up again for each segment's own offset and length
            String filename = path.toString();
            IFormatDataFile ifdf = new IFormatDataFile(job);
            ifdf.open(filename);
            ISegmentIndex segmentIndex = ifdf.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                blkLocations = fs.getFileBlockLocations(file, segmentIndex.getSegOffset(i),
                        segmentIndex.getseglen(i));
                FormatStorageInputSplit split = new FormatStorageInputSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline()
                                + 1,
                        blkLocations[0].getHosts());
                splits.add(split);
            }
            ifdf.close();
        }
    }
    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}
From source file: StorageEngineClient.HashMultiFileColumnStorageInputFormat.java
License: Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // keep only the largest "_idx" file per key
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    List<Path> paths = new ArrayList<Path>();
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        // block locations are taken from the first file only;
        // all files end up in a single combined split
        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }
        paths.add(new Path(filekey));
    }
    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}
From source file: StorageEngineClient.HashMultiFileFormatStorageInputFormat.java
License: Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    List<Path> paths = new ArrayList<Path>(10);
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        // block locations are taken from the first file only;
        // all files end up in a single combined split
        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }
        paths.add(path);
    }
    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}
From source file: wiki.hadoop.mapred.lib.input.StreamWikiDumpInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {
        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in, split.getStart());

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(path + ": #" + splits.size() + " " + pageEndPattern + " found: pos="
                    + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read="
                    + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining
                    + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
    return splits;
}