Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

This page collects usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Returns an array containing the hostnames, offsets, and sizes of portions of the given file.
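
Before the project-specific examples below, here is a minimal, hedged sketch of calling this method directly. The path and configuration are placeholders; it assumes a reachable HDFS (or other Hadoop-compatible) filesystem is configured on the classpath.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrintBlockLocations {
    public static void main(String[] args) throws Exception {
        // Placeholder path: replace with a file that exists on your cluster.
        Path file = new Path("/tmp/example.txt");
        FileSystem fs = FileSystem.get(new Configuration());

        // Ask for the whole file by using its actual length as the range.
        FileStatus status = fs.getFileStatus(file);
        BlockLocation[] locations = fs.getFileBlockLocations(file, 0, status.getLen());
        for (BlockLocation loc : locations) {
            System.out.println("offset=" + loc.getOffset() + " length=" + loc.getLength()
                    + " hosts=" + Arrays.toString(loc.getHosts()));
        }
        fs.close();
    }
}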

Usage

From source file: com.cloudera.GetBlockLocations.java

License: Apache License

public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    String url = getStringOrDie("get.block.locations.path");
    final FileSystem fs = FileSystem.get(new URI(url), conf);

    if (!fs.exists(new Path(url))) {
        System.out.println("no file at " + url);
        System.exit(1);
    }
    BlockLocation[] locs = null;
    try {
        locs = fs.getFileBlockLocations(new Path(url), 0, Long.MAX_VALUE);
    } catch (IOException e) {
        System.out.println("Error calling getFileBlockLocations(" + url + ")\n");
        e.printStackTrace(System.err);
        System.exit(1);
    }

    String prefix = "";
    for (BlockLocation loc : locs) {
        System.out.println(prefix);
        System.out.println("{");
        System.out.println("  hosts =         " + Arrays.toString(loc.getHosts()));
        System.out.println("  cachedHosts =   " + Arrays.toString(loc.getCachedHosts()));
        System.out.println("  names    =      " + Arrays.toString(loc.getNames()));
        System.out.println("  topologyPaths = " + Arrays.toString(loc.getTopologyPaths()));
        System.out.println("  offset =        " + loc.getOffset());
        System.out.println("  length =        " + loc.getLength());
        System.out.println("  corrupt =       " + loc.isCorrupt());
        System.out.println("}");
        prefix = ",";
    }
}

From source file: com.cloudera.impala.catalog.HdfsTable.java

License: Apache License

/**
 * Loads the file block metadata for the given collection of FileDescriptors.  The
 * FileDescriptors are passed as a tree, where the first level is indexed by
 * filesystem, the second level is indexed by partition location, and the leaves are
 * the list of files that exist under each directory.
 */
private void loadBlockMd(Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescs) throws RuntimeException {
    Preconditions.checkNotNull(perFsFileDescs);
    LOG.debug("load block md for " + name_);

    for (FsKey fsEntry : perFsFileDescs.keySet()) {
        FileSystem fs = fsEntry.filesystem;
        // Store all BlockLocations so they can be reused when loading the disk IDs.
        List<BlockLocation> blockLocations = Lists.newArrayList();
        int numCachedBlocks = 0;
        Map<String, List<FileDescriptor>> partitionToFds = perFsFileDescs.get(fsEntry);
        Preconditions.checkNotNull(partitionToFds);
        // loop over all files and record their block metadata, minus volume ids
        for (String partitionDir : partitionToFds.keySet()) {
            Path partDirPath = new Path(partitionDir);
            for (FileDescriptor fileDescriptor : partitionToFds.get(partitionDir)) {
                Path p = new Path(partDirPath, fileDescriptor.getFileName());
                try {
                    FileStatus fileStatus = fs.getFileStatus(p);
                    // fileDescriptors should not contain directories.
                    Preconditions.checkArgument(!fileStatus.isDirectory());
                    BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
                    Preconditions.checkNotNull(locations);
                    blockLocations.addAll(Arrays.asList(locations));

                    // Loop over all blocks in the file.
                    for (BlockLocation block : locations) {
                        String[] blockHostPorts;
                        try {
                            blockHostPorts = block.getNames();
                        } catch (IOException e) {
                            // Not expected in practice: getNames() declares IOException
                            // but does not throw it for HDFS block locations.
                            String errorMsg = "BlockLocation.getNames() failed:\n" + e.getMessage();
                            LOG.error(errorMsg);
                            throw new IllegalStateException(errorMsg);
                        }
                        // Now enumerate all replicas of the block, adding any unknown hosts to
                        // hostIndex_ and the index for that host to replicaHostIdxs.
                        List<Integer> replicaHostIdxs = new ArrayList<Integer>(blockHostPorts.length);
                        for (int i = 0; i < blockHostPorts.length; ++i) {
                            String[] ip_port = blockHostPorts[i].split(":");
                            Preconditions.checkState(ip_port.length == 2);
                            TNetworkAddress network_address = new TNetworkAddress(ip_port[0],
                                    Integer.parseInt(ip_port[1]));
                            replicaHostIdxs.add(hostIndex_.getIndex(network_address));
                        }
                        fileDescriptor.addFileBlock(
                                new FileBlock(block.getOffset(), block.getLength(), replicaHostIdxs));
                    }
                } catch (IOException e) {
                    throw new RuntimeException(
                            "couldn't determine block locations for path '" + p + "':\n" + e.getMessage(), e);
                }
            }
        }

        if (SUPPORTS_VOLUME_ID && fs instanceof DistributedFileSystem) {
            LOG.trace("loading disk ids for: " + getFullName() + ". nodes: " + getNumNodes() + ". file system: "
                    + fsEntry);
            loadDiskIds((DistributedFileSystem) fs, blockLocations, partitionToFds);
            LOG.trace("completed load of disk ids for: " + getFullName());
        }
    }
}

From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java

License: Apache License

/**
 * List file status by calling fileSystem.listStatus.
 */
private static void listStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        FileStatus[] fileStatus = fs.listStatus(path);
        if (fs.exists(path)) {
            for (FileStatus status : fileStatus) {
                BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation loc : locations) {
                    loc.getNames();
                    loc.getHosts();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file: com.cloudera.impala.util.LoadMetadataUtil.java

License: Apache License

/**
 * Queries the filesystem to load the file block metadata (e.g. DFS blocks) for the
 * given file. Adds the newly created block metadata and block location to the
 * perFsFileBlocks, so that the storage IDs for each block can be retrieved from
 * BlockLocation.
 *
 * Must be threadsafe. Access to 'perFsFileBlocks' and 'hostIndex' must be protected.
 */
private static void loadBlockMetadata(FileSystem fs, FileStatus file, FileDescriptor fd,
        HdfsFileFormat fileFormat, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName,
        ListMap<TNetworkAddress> hostIndex) {
    Preconditions.checkNotNull(fd);
    Preconditions.checkNotNull(perFsFileBlocks);
    Preconditions.checkArgument(!file.isDirectory());
    LOG.debug("load block md for " + tblName + " file " + fd.getFileName());

    if (!FileSystemUtil.hasGetFileBlockLocations(fs)) {
        synthesizeBlockMetadata(file, fd, fileFormat, hostIndex);
        return;
    }
    try {
        BlockLocation[] locations = null;
        if (file instanceof LocatedFileStatus) {
            locations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
            locations = fs.getFileBlockLocations(file, 0, file.getLen());
        }
        Preconditions.checkNotNull(locations);

        // Loop over all blocks in the file.
        for (BlockLocation loc : locations) {
            Preconditions.checkNotNull(loc);
            fd.addFileBlock(createFileBlock(loc, hostIndex));
        }

        // Remember the THdfsFileBlocks and corresponding BlockLocations. Once all the
        // blocks are collected, the disk IDs will be queried in one batch per filesystem.
        FsKey fsKey = new FsKey(fs);
        synchronized (perFsFileBlocks) {
            FileBlocksInfo infos = perFsFileBlocks.get(fsKey);
            if (infos == null) {
                infos = new FileBlocksInfo();
                perFsFileBlocks.put(fsKey, infos);
            }
            infos.addBlocks(fd.getFileBlocks(), Arrays.asList(locations));
        }
    } catch (IOException e) {
        throw new RuntimeException(
                "couldn't determine block locations for path '" + file.getPath() + "':\n" + e.getMessage(), e);
    }
}
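
The LocatedFileStatus branch above skips a second round trip to the namenode when the file listing already carries block locations. The sketch below shows that pattern directly via FileSystem.listLocatedStatus; it is an illustration only, not part of the Impala source, and the directory path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListWithLocations {
    public static void main(String[] args) throws Exception {
        // Placeholder directory: replace with a real path on your cluster.
        Path dir = new Path("/tmp/data");
        FileSystem fs = dir.getFileSystem(new Configuration());

        // Each LocatedFileStatus already carries its BlockLocations, so no
        // per-file getFileBlockLocations() call is needed.
        RemoteIterator<LocatedFileStatus> it = fs.listLocatedStatus(dir);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            for (BlockLocation loc : status.getBlockLocations()) {
                System.out.println(status.getPath() + " block at offset " + loc.getOffset());
            }
        }
    }
}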

From source file: com.cloudera.kitten.appmaster.util.HDFSFileFinder.java

License: Open Source License

public static Map<String, Long> getNumBytesOfGlobHeldByDatanodes(Path p, Configuration conf)
        throws IOException {
    FileSystem fs = p.getFileSystem(conf);

    HashMap<String, Long> bytesHeld = Maps.newHashMap();
    for (FileStatus f : fs.globStatus(p)) {
        // Query block locations for each matched file rather than the glob pattern itself.
        BlockLocation[] bls = fs.getFileBlockLocations(f.getPath(), 0, f.getLen());
        if (bls.length > 0) {
            for (BlockLocation bl : bls) {
                long l = bl.getLength();
                for (String name : bl.getNames()) {
                    if (bytesHeld.containsKey(name))
                        bytesHeld.put(name, bytesHeld.get(name) + l);
                    else
                        bytesHeld.put(name, l);
                }
            }
        }
    }

    return bytesHeld;
}
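
A hypothetical invocation of the helper above, assuming the HDFSFileFinder class and a reachable cluster; the glob is a placeholder.

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.cloudera.kitten.appmaster.util.HDFSFileFinder;

public class BytesPerDatanode {
    public static void main(String[] args) throws Exception {
        // Placeholder glob: replace with the files of interest on your cluster.
        Map<String, Long> bytesByDatanode = HDFSFileFinder.getNumBytesOfGlobHeldByDatanodes(
                new Path("/data/part-*"), new Configuration());
        for (Map.Entry<String, Long> e : bytesByDatanode.entrySet()) {
            System.out.println(e.getKey() + " holds " + e.getValue() + " bytes");
        }
    }
}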

From source file: com.dinglicom.clouder.mapreduce.input.FileInputFormat.java

License: Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
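
The getBlockIndex calls above map a split's starting offset to the block that contains it, so each split advertises that block's hosts for locality. The following sketch mirrors that lookup for illustration only; Hadoop's FileInputFormat already provides an equivalent protected helper.

import org.apache.hadoop.fs.BlockLocation;

final class BlockIndexLookup {
    // Returns the index of the block whose byte range [offset, offset + length)
    // contains the given file offset.
    static int blockIndexFor(BlockLocation[] blocks, long offset) {
        for (int i = 0; i < blocks.length; i++) {
            long start = blocks[i].getOffset();
            long end = start + blocks[i].getLength();
            if (offset >= start && offset < end) {
                return i;
            }
        }
        throw new IllegalArgumentException("Offset " + offset + " is outside the file's blocks");
    }
}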

From source file: com.edwardsit.spark4n6.EWFImageInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    log.setLevel(Level.DEBUG);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    BlockLocation[] blkLocations = null;
    Path path = null;
    FileSystem fs = null;
    EWFFileReader ewf = null;
    ArrayList<EWFSection.SectionPrefix> sections = null;
    Iterator<EWFSection.SectionPrefix> it = null;
    EWFSection.SectionPrefix sp = null;
    Path priorFile = null;
    long priorOffset = 0L;
    FileStatus priorFileStatus = null;
    chunkSize = new EWFSegmentFileReader(fs).DEFAULT_CHUNK_SIZE;
    long priorStart = 0L;
    int blkIndex = 0;
    for (FileStatus file : files) {
        path = file.getPath();
        fs = path.getFileSystem(job.getConfiguration());
        if (path.getName().endsWith(".E01")) {

            ewf = new EWFFileReader(fs, path);
            sections = ewf.getSectionPrefixArray();
            it = sections.iterator();
            while (it.hasNext()) {
                sp = it.next();
                if (sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
                    priorFileStatus = fs.getFileStatus(priorFile);
                    for (long i = sp.chunkCount; i > 0L; i = i - getChunksPerSplit(priorFileStatus)) {
                        if (priorFileStatus instanceof LocatedFileStatus) {
                            blkLocations = ((LocatedFileStatus) priorFileStatus).getBlockLocations();
                        } else {
                            blkLocations = fs.getFileBlockLocations(priorFileStatus, priorOffset,
                                    (getChunksPerSplit(priorFileStatus) * chunkSize));
                        }
                        blkIndex = getBlockIndex(blkLocations, priorOffset);
                        if (i > getChunksPerSplit(priorFileStatus)) {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize)
                                    + ", " + (getChunksPerSplit(priorFileStatus) * chunkSize) + ", "
                                    + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize),
                                    (getChunksPerSplit(priorFileStatus) * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += getChunksPerSplit(priorFileStatus);
                        } else {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize)
                                    + ", " + (i * chunkSize) + ", " + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize), (i * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += i;
                        }
                    }
                }
                priorFile = sp.file;
                priorOffset = sp.fileOffset;
            }
        }
    }
    return splits;
}

From source file: com.facebook.presto.hive.BackgroundHiveSplitLoader.java

License: Apache License

private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED,
                    "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

            for (InputSplit inputSplit : targetSplits) {
                FileSplit split = (FileSplit) inputSplit;
                FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(session.getUser(), split.getPath());
                FileStatus file = targetFilesystem.getFileStatus(split.getPath());
                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file.getPath().toString(),
                        targetFilesystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
                        split.getStart(), split.getLength(), schema, partitionKeys, false, session,
                        OptionalInt.empty(), effectivePredicate, partition.getColumnCoercions()));
                if (stopped) {
                    return;
                }
            }
        }
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName,
            inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber),
                    effectivePredicate, partition.getColumnCoercions()));
        }

        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex),
                    iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }

        return;
    }

    fileIterators.addLast(iterator);
}

From source file: com.facebook.presto.hive.HiveSplitIterable.java

License: Apache License

private void loadPartitionSplits(final HiveSplitQueue hiveSplitQueue, SuspendingExecutor suspendingExecutor)
        throws InterruptedException {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            semaphore.acquire();
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
            Path partitionPath = hdfsEnvironment.getFileSystemWrapper().wrap(path);

            FileSystem fs = partitionPath.getFileSystem(configuration);
            final LastSplitMarkingQueue markerQueue = new LastSplitMarkingQueue(hiveSplitQueue);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, partitionPath);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit)
                            .getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    markerQueue.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(),
                                    split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false));
                }
                markerQueue.finish();
                continue;
            }

            ListenableFuture<Void> partitionFuture = new AsyncRecursiveWalker(fs, suspendingExecutor)
                    .beginWalk(partitionPath, new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            if (bucket.isPresent()
                                    && !fileMatchesBucket(file.getPath().getName(), bucket.get())) {
                                return;
                            }

                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                markerQueue.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                        file.getLen(), schema, partitionKeys, splittable));
                            } catch (IOException e) {
                                hiveSplitQueue.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    markerQueue.finish();
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    markerQueue.finish();
                    semaphore.release();
                }
            });
            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitQueue.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitQueue.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitQueue.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}

From source file: com.facebook.presto.hive.HiveSplitSourceProvider.java

License: Apache License

private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
        final ConnectorSession session) {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

            FileSystem fs = path.getFileSystem(configuration);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, path);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit)
                            .getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(),
                                    split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                }
                continue;
            }

            // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
            if (bucket.isPresent()) {
                Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                if (bucketFile.isPresent()) {
                    FileStatus file = bucketFile.get();
                    BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                    boolean splittable = isSplittable(inputFormat, fs, file.getPath());

                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                            file.getLen(), schema, partitionKeys, splittable, session));
                    continue;
                }
            }

            // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
            // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
            // callback to release it. Otherwise, we will need a try-finally block around this section.
            try {
                semaphore.acquire();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                    new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                        0, file.getLen(), schema, partitionKeys, splittable, session));
                            } catch (IOException e) {
                                hiveSplitSource.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitSource.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitSource.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitSource.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}