Usage examples for org.apache.hadoop.fs.BlockLocation.getLength()
public long getLength()
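getLength() returns the number of bytes stored in a block. The project snippets below all follow the same pattern: obtain a file's BlockLocation array from the FileSystem, then use each block's offset and length to size splits, pick hosts, or tally bytes. Before the project code, here is a minimal self-contained sketch of that pattern; the path /tmp/example.txt and the class name BlockLengthExample are placeholders, not taken from any project below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt"); // placeholder path

        // ask the filesystem for the file's block locations
        FileStatus status = fs.getFileStatus(path);
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());

        long total = 0;
        for (BlockLocation block : blocks) {
            // getLength() is the number of bytes stored in this block
            total += block.getLength();
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    block.getOffset(), block.getLength(), String.join(",", block.getHosts()));
        }
        // the block lengths of a contiguous file sum to the file size
        System.out.println("total=" + total + " fileSize=" + status.getLen());
    }
}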
From source file: com.facebook.presto.hive.BackgroundHiveSplitLoader.java
License: Apache License
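Presto's background split loader chunks each block into splits no larger than the configured maximum; getLength() drives the chunk count, the loop bound, and the final sanity check.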
private List<HiveSplit> createHiveSplits(String partitionName, String path, BlockLocation[] blockLocations,
        long start, long length, Properties schema, List<HivePartitionKey> partitionKeys, boolean splittable,
        ConnectorSession session, OptionalInt bucketNumber, TupleDomain<HiveColumnHandle> effectivePredicate,
        Map<Integer, HiveType> columnCoercions)
        throws IOException {
    ImmutableList.Builder<HiveSplit> builder = ImmutableList.builder();

    boolean forceLocalScheduling = HiveSessionProperties.isForceLocalScheduling(session);

    if (splittable) {
        for (BlockLocation blockLocation : blockLocations) {
            // get the addresses for the block
            List<HostAddress> addresses = toHostAddress(blockLocation.getHosts());

            long maxBytes = maxSplitSize.toBytes();
            boolean creatingInitialSplits = false;

            if (remainingInitialSplits.get() > 0) {
                maxBytes = maxInitialSplitSize.toBytes();
                creatingInitialSplits = true;
            }

            // divide the block into uniform chunks that are smaller than the max split size
            int chunks = Math.max(1, (int) (blockLocation.getLength() / maxBytes));
            // when the block does not divide evenly into chunks, make the chunk size slightly bigger than necessary
            long targetChunkSize = (long) Math.ceil(blockLocation.getLength() * 1.0 / chunks);

            long chunkOffset = 0;
            while (chunkOffset < blockLocation.getLength()) {
                if (remainingInitialSplits.decrementAndGet() < 0 && creatingInitialSplits) {
                    creatingInitialSplits = false;
                    // recalculate the target chunk size
                    maxBytes = maxSplitSize.toBytes();
                    long remainingLength = blockLocation.getLength() - chunkOffset;
                    chunks = Math.max(1, (int) (remainingLength / maxBytes));
                    targetChunkSize = (long) Math.ceil(remainingLength * 1.0 / chunks);
                }
                // adjust the actual chunk size to account for the overrun when chunks are slightly bigger than necessary (see above)
                long chunkLength = Math.min(targetChunkSize, blockLocation.getLength() - chunkOffset);

                builder.add(new HiveSplit(connectorId, table.getDatabaseName(), table.getTableName(),
                        partitionName, path, blockLocation.getOffset() + chunkOffset, chunkLength, schema,
                        partitionKeys, addresses, bucketNumber,
                        forceLocalScheduling && hasRealAddress(addresses), effectivePredicate,
                        columnCoercions));

                chunkOffset += chunkLength;
            }
            checkState(chunkOffset == blockLocation.getLength(), "Error splitting blocks");
        }
    }
    else {
        // not splittable, use the hosts from the first block if it exists
        List<HostAddress> addresses = ImmutableList.of();
        if (blockLocations.length > 0) {
            addresses = toHostAddress(blockLocations[0].getHosts());
        }

        builder.add(new HiveSplit(connectorId, table.getDatabaseName(), table.getTableName(), partitionName,
                path, start, length, schema, partitionKeys, addresses, bucketNumber,
                forceLocalScheduling && hasRealAddress(addresses), effectivePredicate, columnCoercions));
    }
    return builder.build();
}
From source file: com.facebook.presto.hive.HiveSplitIterable.java
License: Apache License
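Another Presto split generator applies the same uniform-chunking scheme, without the special-case sizing for initial splits.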
private List<HiveSplit> createHiveSplits(String partitionName, FileStatus file, BlockLocation[] blockLocations,
        long start, long length, Properties schema, List<HivePartitionKey> partitionKeys, boolean splittable)
        throws IOException {
    ImmutableList.Builder<HiveSplit> builder = ImmutableList.builder();
    if (splittable) {
        for (BlockLocation blockLocation : blockLocations) {
            // get the addresses for the block
            List<HostAddress> addresses = toHostAddress(blockLocation.getHosts());

            // divide the block into uniform chunks that are smaller than the max split size
            int chunks = Math.max(1, (int) (blockLocation.getLength() / maxSplitSize.toBytes()));
            // when the block does not divide evenly into chunks, make the chunk size slightly bigger than necessary
            long targetChunkSize = (long) Math.ceil(blockLocation.getLength() * 1.0 / chunks);

            long chunkOffset = 0;
            while (chunkOffset < blockLocation.getLength()) {
                // adjust the actual chunk size to account for the overrun when chunks are slightly bigger than necessary (see above)
                long chunkLength = Math.min(targetChunkSize, blockLocation.getLength() - chunkOffset);

                builder.add(new HiveSplit(clientId, table.getDbName(), table.getTableName(), partitionName,
                        false, file.getPath().toString(), blockLocation.getOffset() + chunkOffset, chunkLength,
                        schema, partitionKeys, addresses));

                chunkOffset += chunkLength;
            }
            checkState(chunkOffset == blockLocation.getLength(), "Error splitting blocks");
        }
    }
    else {
        // not splittable, use the hosts from the first block
        builder.add(new HiveSplit(clientId, table.getDbName(), table.getTableName(), partitionName, false,
                file.getPath().toString(), start, length, schema, partitionKeys,
                toHostAddress(blockLocations[0].getHosts())));
    }
    return builder.build();
}
From source file: com.facebook.presto.hive.HiveSplitSourceProvider.java
License: Apache License
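This variant uses a smaller maximum split size while a budget of initial splits remains, decrementing the budget as chunks are emitted.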
private List<HiveSplit> createHiveSplits(String partitionName, FileStatus file, BlockLocation[] blockLocations,
        long start, long length, Properties schema, List<HivePartitionKey> partitionKeys, boolean splittable,
        ConnectorSession session)
        throws IOException {
    ImmutableList.Builder<HiveSplit> builder = ImmutableList.builder();
    if (splittable) {
        for (BlockLocation blockLocation : blockLocations) {
            // get the addresses for the block
            List<HostAddress> addresses = toHostAddress(blockLocation.getHosts());

            long maxBytes = maxSplitSize.toBytes();
            if (remainingInitialSplits > 0) {
                maxBytes = maxInitialSplitSize.toBytes();
            }

            // divide the block into uniform chunks that are smaller than the max split size
            int chunks = Math.max(1, (int) (blockLocation.getLength() / maxBytes));
            // when the block does not divide evenly into chunks, make the chunk size slightly bigger than necessary
            long targetChunkSize = (long) Math.ceil(blockLocation.getLength() * 1.0 / chunks);

            long chunkOffset = 0;
            while (chunkOffset < blockLocation.getLength()) {
                // adjust the actual chunk size to account for the overrun when chunks are slightly bigger than necessary (see above)
                long chunkLength = Math.min(targetChunkSize, blockLocation.getLength() - chunkOffset);

                builder.add(new HiveSplit(connectorId, table.getDbName(), table.getTableName(), partitionName,
                        file.getPath().toString(), blockLocation.getOffset() + chunkOffset, chunkLength,
                        schema, partitionKeys, addresses, session));

                chunkOffset += chunkLength;
                remainingInitialSplits--;
            }
            checkState(chunkOffset == blockLocation.getLength(), "Error splitting blocks");
        }
    }
    else {
        // not splittable, use the hosts from the first block if it exists
        List<HostAddress> addresses = ImmutableList.of();
        if (blockLocations.length > 0) {
            addresses = toHostAddress(blockLocations[0].getHosts());
        }

        builder.add(new HiveSplit(connectorId, table.getDbName(), table.getTableName(), partitionName,
                file.getPath().toString(), start, length, schema, partitionKeys, addresses, session));
    }
    return builder.build();
}
From source file: com.facebook.presto.hive.util.InternalHiveSplitFactory.java
License: Apache License
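Here the block's offset plus getLength() gives the block end, which is clamped against the requested [start, start + length) range; empty files may need a synthesized empty block.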
private Optional<InternalHiveSplit> createInternalHiveSplit(Path path, BlockLocation[] blockLocations,
        long start, long length, long fileSize, OptionalInt bucketNumber, boolean splittable) {
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }

    boolean forceLocalScheduling = this.forceLocalScheduling;

    // For empty files, some filesystems (e.g. LocalFileSystem) produce one empty block
    // while others (e.g. hdfs.DistributedFileSystem) produce no block.
    // Synthesize an empty block if one does not already exist.
    if (fileSize == 0 && blockLocations.length == 0) {
        blockLocations = new BlockLocation[] { new BlockLocation() };
        // Turn off force local scheduling because the hosts list doesn't exist.
        forceLocalScheduling = false;
    }

    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: the slice is empty,
            // and the block covers the empty slice interval
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(blocks, start, length);

    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }

    return Optional.of(new InternalHiveSplit(partitionName, pathString, start, start + length, fileSize,
            schema, partitionKeys, blocks, bucketNumber, splittable,
            forceLocalScheduling && allBlocksHaveRealAddress(blocks), columnCoercions, bucketConversion,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path)));
}
From source file: com.google.mr4c.hadoop.DataLocalizer.java
License: Open Source License
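Sums getLength() over every block of a collection of file sources to obtain the total size passed to the split-host calculator.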
public List<String> localize(Collection<DataFileSource> sources) throws IOException {
    List<BlockLocation> allBlocks = new ArrayList<BlockLocation>();
    long totalSize = 0;
    for (DataFileSource src : sources) {
        BlockLocation[] blocks = src.getBlockLocation();
        allBlocks.addAll(Arrays.asList(blocks));
        for (BlockLocation block : blocks) {
            totalSize += block.getLength();
        }
    }
    return Arrays.asList(m_calc.calcSplitHosts(allBlocks.toArray(new BlockLocation[allBlocks.size()]), 0,
            totalSize, m_topo));
}
From source file: com.linkedin.cubert.io.rubix.RubixInputSplit.java
License: Open Source License
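Uses each block's offset and length to count the bytes of the split served by each host, then keeps the hosts with the most relevant bytes.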
@Override
public String[] getLocations() throws IOException, InterruptedException {
    if (hostnames == null) {
        /* Obtain the FileSystem object and get the FileStatus object for the split */
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(filename);

        /*
         * Obtain the block locations for the split. This also provides the offset and
         * length information for each block.
         */
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, offset, length);

        /*
         * Collect all hosts in a map and populate the number of bytes to be read from
         * each host.
         */
        Long l;
        Map<String, Long> hostMap = new HashMap<String, Long>();
        for (BlockLocation bl : blockLocations) {
            final long start = bl.getOffset() < offset ? offset : bl.getOffset();
            final long end = (offset + length) < (bl.getOffset() + bl.getLength()) ? offset + length
                    : bl.getOffset() + bl.getLength();
            final long nRelevantBytes = end - start;
            for (String host : bl.getHosts()) {
                hostMap.put(host, ((l = hostMap.get(host)) == null ? 0 : l) + nRelevantBytes);
            }
        }

        /* Sort them in decreasing order of the number of relevant bytes */
        final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
        final Map.Entry<String, Long>[] hostLengthPairs = entries.toArray(new Map.Entry[entries.size()]);
        Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
                // use Long.compare to avoid overflow when casting a long difference to int
                return Long.compare(e2.getValue(), e1.getValue());
            }
        });

        /* Populate the hostnames object */
        final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
        hostnames = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            hostnames[i] = hostLengthPairs[i].getKey();
        }
    }
    return hostnames;
}
From source file: com.mongodb.hadoop.splitter.BSONSplitter.java
License: Apache License
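Returns the index of the longest block in the array, or -1 if the array is null.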
public static int getLargestBlockIndex(final BlockLocation[] blockLocations) {
    int retVal = -1;
    if (blockLocations == null) {
        return retVal;
    }
    long max = 0;
    for (int i = 0; i < blockLocations.length; i++) {
        BlockLocation blk = blockLocations[i];
        if (blk.getLength() > max) {
            // track the running maximum as well as its index; without this update,
            // every non-empty block would overwrite retVal
            max = blk.getLength();
            retVal = i;
        }
    }
    return retVal;
}
From source file: com.ricemap.spateDB.mapred.IndexedPrism.java
License: Apache License
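Builds combine splits for a two-file spatial join; in the Cartesian-product fallback, each block's offset, length, and hosts become a FileSplit.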
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(final JobConf job, int numSplits) throws IOException {
    // Get a list of all input files. There should be exactly two files.
    final Path[] inputFiles = getInputPaths(job);
    GlobalIndex<Partition> gIndexes[] = new GlobalIndex[inputFiles.length];

    BlockFilter blockFilter = null;
    try {
        Class<? extends BlockFilter> blockFilterClass = job.getClass(SpatialSite.FilterClass, null,
                BlockFilter.class);
        if (blockFilterClass != null) {
            // Get all blocks the user wants to process
            blockFilter = blockFilterClass.newInstance();
            blockFilter.configure(job);
        }
    } catch (InstantiationException e1) {
        e1.printStackTrace();
    } catch (IllegalAccessException e1) {
        e1.printStackTrace();
    }

    if (blockFilter != null) {
        // Extract global indexes from input files
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            gIndexes[i_file] = SpatialSite.getGlobalIndex(fs, inputFiles[i_file]);
        }
    }

    final Vector<CombineFileSplit> matchedSplits = new Vector<CombineFileSplit>();
    if (gIndexes[0] == null || gIndexes[1] == null) {
        // Join every possible pair (Cartesian product)
        BlockLocation[][] fileBlockLocations = new BlockLocation[inputFiles.length][];
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            FileStatus fileStatus = fs.getFileStatus(inputFiles[i_file]);
            fileBlockLocations[i_file] = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        }
        LOG.info("Doing a Cartesian product of blocks: " + fileBlockLocations[0].length + "x"
                + fileBlockLocations[1].length);
        for (BlockLocation block1 : fileBlockLocations[0]) {
            for (BlockLocation block2 : fileBlockLocations[1]) {
                FileSplit fsplit1 = new FileSplit(inputFiles[0], block1.getOffset(), block1.getLength(),
                        block1.getHosts());
                FileSplit fsplit2 = new FileSplit(inputFiles[1], block2.getOffset(), block2.getLength(),
                        block2.getHosts());
                CombineFileSplit combinedSplit = (CombineFileSplit) FileSplitUtil.combineFileSplits(job,
                        fsplit1, fsplit2);
                matchedSplits.add(combinedSplit);
            }
        }
    } else {
        // Filter block pairs by the BlockFilter
        blockFilter.selectCellPairs(gIndexes[0], gIndexes[1], new ResultCollector2<Partition, Partition>() {
            @Override
            public void collect(Partition p1, Partition p2) {
                try {
                    List<FileSplit> splits1 = new ArrayList<FileSplit>();
                    Path path1 = new Path(inputFiles[0], p1.filename);
                    splitFile(job, path1, splits1);

                    List<FileSplit> splits2 = new ArrayList<FileSplit>();
                    Path path2 = new Path(inputFiles[1], p2.filename);
                    splitFile(job, path2, splits2);

                    for (FileSplit split1 : splits1) {
                        for (FileSplit split2 : splits2) {
                            matchedSplits.add(
                                    (CombineFileSplit) FileSplitUtil.combineFileSplits(job, split1, split2));
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }

    LOG.info("Matched " + matchedSplits.size() + " combine splits");

    // Return all matched splits
    return matchedSplits.toArray(new InputSplit[matchedSplits.size()]);
}
From source file: com.zjy.mongo.splitter.BSONSplitter.java
License: Apache License
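Locates the block whose [offset, offset + length) interval contains a given file offset, throwing IllegalArgumentException when the offset lies beyond the last block.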
/**
 * Get the index of the block within the given BlockLocations that
 * contains the given offset. Raises IllegalArgumentException if the
 * offset is outside the file.
 *
 * @param blockLocations BlockLocations to search.
 * @param offset the offset into the file.
 * @return the index of the BlockLocation containing the offset.
 */
private static int getBlockIndex(final BlockLocation[] blockLocations, final long offset) {
    for (int i = 0; i < blockLocations.length; i++) {
        BlockLocation bl = blockLocations[i];
        if (bl.getOffset() <= offset && offset < bl.getOffset() + bl.getLength()) {
            return i;
        }
    }
    BlockLocation lastBlock = blockLocations[blockLocations.length - 1];
    long fileLength = lastBlock.getOffset() + lastBlock.getLength() - 1;
    throw new IllegalArgumentException(
            String.format("Offset %d is outside the file [0..%d].", offset, fileLength));
}
From source file: de.huberlin.wbi.hiway.common.Data.java
License: Apache License
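Counts the bytes of a file stored on the same node as a given YARN container by matching each block's hosts against the container's host.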
long countAvailableLocalData(Container container) throws IOException {
    BlockLocation[] blockLocations = null;
    Path hdfsLocation = getHdfsPath();
    while (blockLocations == null) {
        FileStatus fileStatus = hdfs.getFileStatus(hdfsLocation);
        blockLocations = hdfs.getFileBlockLocations(hdfsLocation, 0, fileStatus.getLen());
    }

    long sum = 0;
    for (BlockLocation blockLocation : blockLocations) {
        for (String host : blockLocation.getHosts()) {
            if (container.getNodeId().getHost().equals(host)) {
                sum += blockLocation.getLength();
                break;
            }
        }
    }
    return sum;
}