List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
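Before the project-specific examples below, here is a minimal, self-contained sketch of calling getFileBlockLocations directly with the Path overload shown above. The path "/data/example.txt" and the default Configuration are illustrative assumptions, not values taken from any of the projects listed here.

// Minimal usage sketch: print the block locations of one file.
// "/data/example.txt" and the default Configuration are assumed for illustration only.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/data/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);
        // Ask for the blocks covering the whole file: offset 0, length = file length.
        BlockLocation[] locations = fs.getFileBlockLocations(path, 0, status.getLen());
        for (BlockLocation location : locations) {
            System.out.println(location.getOffset() + "+" + location.getLength()
                    + " on " + String.join(",", location.getHosts()));
        }
    }
}

A FileStatus overload, getFileBlockLocations(FileStatus file, long start, long len), also exists and is what most of the examples below use.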
From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java
License:Apache License
/**
 * Get the host affinity for a row group.
 *
 * @param fileStatus the parquet file
 * @param fs the file system holding the parquet file
 * @param start the start of the row group
 * @param length the length of the row group
 * @return host affinity for the row group
 */
private Map<String, Float> getHostAffinity(FileStatus fileStatus, FileSystem fs, long start, long length)
        throws IOException {
    BlockLocation[] blockLocations = fs.getFileBlockLocations(fileStatus, start, length);
    Map<String, Float> hostAffinityMap = Maps.newHashMap();
    for (BlockLocation blockLocation : blockLocations) {
        for (String host : blockLocation.getHosts()) {
            Float currentAffinity = hostAffinityMap.get(host);
            float blockStart = blockLocation.getOffset();
            float blockEnd = blockStart + blockLocation.getLength();
            float rowGroupEnd = start + length;
            Float newAffinity = (blockLocation.getLength() - (blockStart < start ? start - blockStart : 0)
                    - (blockEnd > rowGroupEnd ? blockEnd - rowGroupEnd : 0)) / length;
            if (currentAffinity != null) {
                hostAffinityMap.put(host, currentAffinity + newAffinity);
            } else {
                hostAffinityMap.put(host, newAffinity);
            }
        }
    }
    return hostAffinityMap;
}
From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java
License:Apache License
/**
 * Common method for generating the list of vertex/edge input splits.
 *
 * @param job The job
 * @param files Array of FileStatus objects for vertex/edge input files
 * @return The list of vertex/edge input splits
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, List<FileStatus> files) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    return splits;
}
From source file:org.apache.hama.bsp.FileInputFormat.java
License:Apache License
/**
 * Splits files returned by {@link #listStatus(BSPJob)} when they're too big. <br/>
 * numSplits will be ignored by the framework.
 */
@Override
public InputSplit[] getSplits(BSPJob job, int numSplits) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);

    /*
     * TODO: This does not consider data locality. When the numSplits
     * (user-defined) is equal to or smaller than the number of DFS splits, we
     * should assign multiple splits to a task.
     */

    // take the short circuit path if we have already partitioned
    // if (numSplits == files.length) {
    //     for (FileStatus file : files) {
    //         if (file != null) {
    //             splits.add(new FileSplit(file.getPath(), 0, file.getLen(),
    //                     new String[0]));
    //         }
    //     }
    //     return splits.toArray(new FileSplit[splits.size()]);
    // }

    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong("bsp.input.files", files.length);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new InputSplit[splits.size()]);
}
From source file:org.apache.parquet.hadoop.ParquetInputFormat.java
License:Apache License
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers, long maxSplitSize,
        long minSplitSize, ReadContext readContext) throws IOException {
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    Filter filter = ParquetInputFormat.getFilter(configuration);

    long rowGroupsDropped = 0;
    long totalRowGroups = 0;

    for (Footer footer : footers) {
        final Path file = footer.getFile();
        LOG.debug(file);
        FileSystem fs = file.getFileSystem(configuration);
        FileStatus fileStatus = fs.getFileStatus(file);
        ParquetMetadata parquetMetaData = footer.getParquetMetadata();
        List<BlockMetaData> blocks = parquetMetaData.getBlocks();

        List<BlockMetaData> filteredBlocks;

        totalRowGroups += blocks.size();
        filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks,
                parquetMetaData.getFileMetaData().getSchema());
        rowGroupsDropped += blocks.size() - filteredBlocks.size();

        if (filteredBlocks.isEmpty()) {
            continue;
        }

        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        splits.addAll(generateSplits(filteredBlocks, fileBlockLocations, fileStatus,
                readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(),
                minSplitSize, maxSplitSize));
    }

    if (rowGroupsDropped > 0 && totalRowGroups > 0) {
        int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
        LOG.info("Dropping " + rowGroupsDropped + " row groups that do not pass filter predicate! ("
                + percentDropped + "%)");
    } else {
        LOG.info("There were no row groups that could be dropped due to filter predicates");
    }
    return splits;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java
License:Apache License
/**
 * This function returns the sample split due to the chunksize
 * and the rate.
 * @param job
 * @return
 * @throws IOException
 */
public List<InputSplit> getSplitsSample(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    float rate = job.getConfiguration().getFloat(PigConfiguration.PIG_H2IRG_ROLLUP_RATE, 0);
    double maximumSamplingSize = 0;
    String inputFile = job.getConfiguration().get("pig.input.dirs", "");
    if (inputFile != "") {
        Path pPivot = new Path(inputFile);
        FileSystem fs = FileSystem.get(job.getConfiguration());
        FileStatus stt = fs.getFileStatus(pPivot);
        long fileLength = stt.getLen();
        maximumSamplingSize = fileLength * rate;
    }

    ArrayList<ArrayList<InputSplit>> splitArray = null;
    int noOfSizes = 0;
    String splitSizes[] = null;
    long preVariableSizes[] = null;

    String variableSplit = job.getConfiguration().get("pig.h2irg.rollup.variablesplit", "");
    if (variableSplit.equals("")) {
        noOfSizes = 4;
        preVariableSizes = new long[4];
        preVariableSizes[0] = 256;
        preVariableSizes[1] = 512;
        preVariableSizes[2] = 1024;
        preVariableSizes[3] = 2048;
    } else {
        splitSizes = variableSplit.split(",");
        noOfSizes = splitSizes.length;
        preVariableSizes = new long[noOfSizes];
        for (int i = 0; i < noOfSizes; i++) {
            preVariableSizes[i] = Long.parseLong(splitSizes[i]);
        }
    }

    long postVariableSizes[] = new long[noOfSizes];
    long oneKB = 1024;
    for (int i = 0; i < noOfSizes; i++) {
        postVariableSizes[i] = preVariableSizes[i] * oneKB;
    }

    splitArray = new ArrayList<ArrayList<InputSplit>>();
    for (int i = 0; i < noOfSizes + 1; i++) {
        ArrayList<InputSplit> single = new ArrayList<InputSplit>();
        splitArray.add(single);
    }

    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long bytesRemaining = length;
            long totalBytes = 0;
            int count = noOfSizes;
            for (int i = 0; i < noOfSizes; i++) {
                while (totalBytes < length / noOfSizes) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    addSplit(splitArray.get(i), new FileSplit(path, length - bytesRemaining,
                            postVariableSizes[i], blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= postVariableSizes[i];
                    totalBytes += postVariableSizes[i];
                }
                count--;
                bytesRemaining = count * (length / noOfSizes);
                totalBytes = 0;
            }
            if (bytesRemaining != 0) {
                addSplit(splitArray.get(splitArray.size() - 1), new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            addSplit(splitArray.get(splitArray.size() - 1),
                    new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            addSplit(splitArray.get(splitArray.size() - 1), new FileSplit(path, 0, length, new String[0]));
        }
    }

    for (int i = 0; i < noOfSizes; i++) {
        log.info("Total # of " + postVariableSizes[i] + " splits: " + splitArray.get(i).size());
        Collections.shuffle(splitArray.get(i));
    }

    List<InputSplit> splitsReturn = new ArrayList<InputSplit>();
    for (int i = 0; i < noOfSizes; i++) {
        int noSampleSplit = (int) Math.ceil(rate * splitArray.get(i).size());
        if (noSampleSplit == 0)
            noSampleSplit = 1;
        for (int j = 0; j < noSampleSplit; j++) {
            splitsReturn.add(splitArray.get(i).get(j));
        }
    }
    log.info("Total # of sampling splits: " + splitsReturn.size());
    return splitsReturn;
}
From source file:org.apache.rya.accumulo.mr.AccumuloHDFSFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    //read the params from AccumuloInputFormat
    Configuration conf = jobContext.getConfiguration();
    Instance instance = MRUtils.AccumuloProps.getInstance(jobContext);
    String user = MRUtils.AccumuloProps.getUsername(jobContext);
    AuthenticationToken password = MRUtils.AccumuloProps.getPassword(jobContext);
    String table = MRUtils.AccumuloProps.getTablename(jobContext);
    ArgumentChecker.notNull(instance);
    ArgumentChecker.notNull(table);

    //find the files necessary
    try {
        Connector connector = instance.getConnector(user, password);
        TableOperations tos = connector.tableOperations();
        String tableId = tos.tableIdMap().get(table);
        Scanner scanner = connector.createScanner("accumulo.metadata", Authorizations.EMPTY); //TODO: auths?
        scanner.setRange(new Range(new Text(tableId + "\u0000"), new Text(tableId + "\uFFFD")));
        scanner.fetchColumnFamily(new Text("file"));
        List<String> files = new ArrayList<String>();
        List<InputSplit> fileSplits = new ArrayList<InputSplit>();
        for (Map.Entry<Key, Value> entry : scanner) {
            String file = entry.getKey().getColumnQualifier().toString();
            Path path = new Path(file);
            FileSystem fs = path.getFileSystem(conf);
            FileStatus fileStatus = fs.getFileStatus(path);
            long len = fileStatus.getLen();
            BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, len);
            files.add(file);
            fileSplits.add(new FileSplit(path, 0, len, fileBlockLocations[0].getHosts()));
        }
        System.out.println(files);
        return fileSplits;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:org.apache.solr.store.hdfs.HdfsLocalityReporter.java
License:Apache License
/**
 * Update the cached block locations for the given directory. This includes deleting any files that no longer
 * exist in the file system and adding any new files that have shown up.
 *
 * @param dir
 *          The directory to refresh
 * @throws IOException
 *           If there is a problem getting info from HDFS
 */
private void refreshDirectory(HdfsDirectory dir) throws IOException {
    Map<FileStatus, BlockLocation[]> directoryCache = cache.get(dir);
    Set<FileStatus> cachedStatuses = directoryCache.keySet();

    FileSystem fs = dir.getFileSystem();
    FileStatus[] statuses = fs.listStatus(dir.getHdfsDirPath());
    List<FileStatus> statusList = Arrays.asList(statuses);
    logger.debug("Updating locality information for: {}", statusList);

    // Keep only the files that still exist
    cachedStatuses.retainAll(statusList);

    // Fill in missing entries in the cache
    for (FileStatus status : statusList) {
        if (!status.isDirectory() && !directoryCache.containsKey(status)) {
            BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
            directoryCache.put(status, locations);
        }
    }
}
From source file:org.apache.sysml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java
License:Apache License
/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(getPath(), job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = null;
    try {
        reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
        reader.next(key, value);
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first iteration
        //as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    }

    //majority consensus on top host
    return getTopHosts(hosts);
}
From source file:org.apache.tajo.storage.AbstractStorageManager.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @throws IOException
 */
public List<FileFragment> getSplits(String tableName, TableMeta meta, Schema schema, Path... inputs)
        throws IOException {
    // generate splits'
    List<FileFragment> splits = Lists.newArrayList();
    List<FileFragment> volumeSplits = Lists.newArrayList();
    List<BlockLocation> blockLocations = Lists.newArrayList();

    for (Path p : inputs) {
        FileSystem fs = p.getFileSystem(conf);
        ArrayList<FileStatus> files = Lists.newArrayList();
        if (fs.isFile(p)) {
            files.addAll(Lists.newArrayList(fs.getFileStatus(p)));
        } else {
            files.addAll(listStatus(p));
        }

        int previousSplitSize = splits.size();
        for (FileStatus file : files) {
            Path path = file.getPath();
            long length = file.getLen();
            if (length > 0) {
                // Get locations of blocks of file
                BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
                boolean splittable = isSplittable(meta, schema, path, file);
                if (blocksMetadataEnabled && fs instanceof DistributedFileSystem) {
                    if (splittable) {
                        for (BlockLocation blockLocation : blkLocations) {
                            volumeSplits.add(makeSplit(tableName, path, blockLocation));
                        }
                        blockLocations.addAll(Arrays.asList(blkLocations));
                    } else { // Non splittable
                        long blockSize = blkLocations[0].getLength();
                        if (blockSize >= length) {
                            blockLocations.addAll(Arrays.asList(blkLocations));
                            for (BlockLocation blockLocation : blkLocations) {
                                volumeSplits.add(makeSplit(tableName, path, blockLocation));
                            }
                        } else {
                            splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                        }
                    }
                } else {
                    if (splittable) {
                        long minSize = Math.max(getMinSplitSize(), 1);
                        long blockSize = file.getBlockSize(); // s3n rest api contained block size but blockLocations is one
                        long splitSize = Math.max(minSize, blockSize);
                        long bytesRemaining = length;

                        // for s3
                        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, splitSize,
                                    blkLocations[blkIndex].getHosts()));
                            bytesRemaining -= splitSize;
                        }
                        if (bytesRemaining > 0) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, bytesRemaining,
                                    blkLocations[blkIndex].getHosts()));
                        }
                    } else { // Non splittable
                        splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                    }
                }
            } else {
                //for zero length files
                splits.add(makeSplit(tableName, path, 0, length));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("# of splits per partition: " + (splits.size() - previousSplitSize));
        }
    }

    // Combine original fileFragments with new VolumeId information
    setVolumeMeta(volumeSplits, blockLocations);
    splits.addAll(volumeSplits);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}
From source file:org.apache.tajo.storage.FileStorageManager.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @throws IOException
 */
public List<Fragment> getSplits(String tableName, TableMeta meta, Schema schema, Path... inputs)
        throws IOException {
    // generate splits'
    List<Fragment> splits = Lists.newArrayList();
    List<Fragment> volumeSplits = Lists.newArrayList();
    List<BlockLocation> blockLocations = Lists.newArrayList();

    for (Path p : inputs) {
        FileSystem fs = p.getFileSystem(conf);
        ArrayList<FileStatus> files = Lists.newArrayList();
        if (fs.isFile(p)) {
            files.addAll(Lists.newArrayList(fs.getFileStatus(p)));
        } else {
            files.addAll(listStatus(p));
        }

        int previousSplitSize = splits.size();
        for (FileStatus file : files) {
            Path path = file.getPath();
            long length = file.getLen();
            if (length > 0) {
                // Get locations of blocks of file
                BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
                boolean splittable = isSplittable(meta, schema, path, file);
                if (blocksMetadataEnabled && fs instanceof DistributedFileSystem) {
                    if (splittable) {
                        for (BlockLocation blockLocation : blkLocations) {
                            volumeSplits.add(makeSplit(tableName, path, blockLocation));
                        }
                        blockLocations.addAll(Arrays.asList(blkLocations));
                    } else { // Non splittable
                        long blockSize = blkLocations[0].getLength();
                        if (blockSize >= length) {
                            blockLocations.addAll(Arrays.asList(blkLocations));
                            for (BlockLocation blockLocation : blkLocations) {
                                volumeSplits.add(makeSplit(tableName, path, blockLocation));
                            }
                        } else {
                            splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                        }
                    }
                } else {
                    if (splittable) {
                        long minSize = Math.max(getMinSplitSize(), 1);
                        long blockSize = file.getBlockSize(); // s3n rest api contained block size but blockLocations is one
                        long splitSize = Math.max(minSize, blockSize);
                        long bytesRemaining = length;

                        // for s3
                        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, splitSize,
                                    blkLocations[blkIndex].getHosts()));
                            bytesRemaining -= splitSize;
                        }
                        if (bytesRemaining > 0) {
                            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                            splits.add(makeSplit(tableName, path, length - bytesRemaining, bytesRemaining,
                                    blkLocations[blkIndex].getHosts()));
                        }
                    } else { // Non splittable
                        splits.add(makeNonSplit(tableName, path, 0, length, blkLocations));
                    }
                }
            } else {
                //for zero length files
                splits.add(makeSplit(tableName, path, 0, length));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("# of splits per partition: " + (splits.size() - previousSplitSize));
        }
    }

    // Combine original fileFragments with new VolumeId information
    setVolumeMeta(volumeSplits, blockLocations);
    splits.addAll(volumeSplits);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}