Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileBlockLocations drawn from open source projects.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Returns an array containing the hostnames, offsets, and sizes of the portions of the given file that fall within the requested byte range. FileSystem also provides an overload that takes a FileStatus instead of a Path; most of the examples below use that overload.
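
Before the project examples, here is a minimal, self-contained sketch of the call itself. The path is a placeholder and the snippet assumes a reachable Hadoop file system; it prints the hosts serving each block of a file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. an HDFS file path passed on the command line
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);
        // Ask for the block locations covering the whole file.
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : blocks) {
            System.out.println("offset=" + block.getOffset()
                    + " length=" + block.getLength()
                    + " hosts=" + String.join(",", block.getHosts()));
        }
    }
}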

Usage

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}
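
The getBlockIndex call above is the standard FileInputFormat helper that maps a byte offset to the block containing it. A simplified sketch of such a lookup (not the exact Hadoop source):

// Simplified sketch: return the index of the block whose byte range contains the offset.
protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
    for (int i = 0; i < blkLocations.length; i++) {
        long blockStart = blkLocations[i].getOffset();
        long blockEnd = blockStart + blkLocations[i].getLength();
        if (blockStart <= offset && offset < blockEnd) {
            return i;
        }
    }
    throw new IllegalArgumentException("Offset " + offset + " is outside the file");
}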

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HoplogUtil.java

License:Apache License

/**
 * Creates a mapping of hoplog to hdfs blocks on disk
 *
 * @param files
 *          list of hoplog file status objects
 * @return array of hdfs block location objects associated with a hoplog
 * @throws IOException
 */
public static Map<FileStatus, BlockLocation[]> getBlocks(Configuration config, Collection<FileStatus> files)
        throws IOException {
    Map<FileStatus, BlockLocation[]> blocks = new HashMap<FileStatus, BlockLocation[]>();
    if (files == null || files.isEmpty()) {
        return blocks;
    }

    FileSystem fs = files.iterator().next().getPath().getFileSystem(config);

    for (FileStatus hoplog : files) {
        long length = hoplog.getLen();
        BlockLocation[] fileBlocks = fs.getFileBlockLocations(hoplog, 0, length);
        blocks.put(hoplog, fileBlocks);
    }

    return blocks;
}
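
A caller could use the returned map, for example, to tally how many bytes each datanode hosts across all hoplogs. The helper below is only an illustrative sketch, not part of HoplogUtil:

// Illustrative sketch (not part of HoplogUtil): sum, per host, the bytes
// covered by the block locations returned by getBlocks().
public static Map<String, Long> bytesPerHost(Map<FileStatus, BlockLocation[]> blocks) throws IOException {
    Map<String, Long> result = new HashMap<String, Long>();
    for (BlockLocation[] locations : blocks.values()) {
        for (BlockLocation location : locations) {
            for (String host : location.getHosts()) {
                Long current = result.get(host);
                result.put(host, (current == null ? 0L : current) + location.getLength());
            }
        }
    }
    return result;
}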

From source file:com.google.mr4c.sources.URIDataFileSource.java

License:Open Source License

@Override
public BlockLocation[] getBlockLocation() throws IOException {
    URI uri = ContentFactories.scrubURI(m_uri);
    FileSystem fs = FileSystem.get(uri, s_config);
    Path path = new Path(uri);
    FileStatus status = fs.getFileStatus(path);
    return fs.getFileBlockLocations(status, 0, status.getBlockSize());
}
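
Note that this example passes status.getBlockSize() as the length, so it only asks for the locations covering roughly the first block of the file. A variant that covers the whole file (a sketch, not part of the original class) would pass the file length instead:

// Sketch: request block locations for the entire file rather than one block's worth.
FileStatus status = fs.getFileStatus(path);
return fs.getFileBlockLocations(status, 0, status.getLen());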

From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java

License:Apache License

protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java

License:Open Source License

/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first iteration
        //as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }

        /*
        int lFrom  = t.getIterations().get(0).getIntValue();
        int lTo    = t.getIterations().get(1).getIntValue();
        int lIncr  = t.getIterations().get(2).getIntValue();            
        for( int i=lFrom; i<=lTo; i+=lIncr )
        {
           String fname = _fname+"/"+String.valueOf( ((i-_offset)/_blen+_offset) );
           FileSystem fs = FileSystem.get(job);
           FileStatus status = fs.getFileStatus(new Path(fname)); 
           BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
           for( BlockLocation bl : tmp1 )
              countHosts(hosts, bl.getHosts());
        }*/
    }

    //System.out.println("Get locations "+time.stop()+"");

    //majority consensus on top host
    return getTopHosts(hosts);
}
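
The countHosts and getTopHosts helpers are not shown in this snippet. Hypothetical sketches of what such helpers might look like (illustrative only, not the original source):

// Hypothetical sketch: tally how many blocks report each host.
private static void countHosts(HashMap<String, Integer> hosts, String[] blockHosts) {
    for (String host : blockHosts) {
        Integer count = hosts.get(host);
        hosts.put(host, count == null ? 1 : count + 1);
    }
}

// Hypothetical sketch: return the host(s) named by the most blocks.
private static String[] getTopHosts(HashMap<String, Integer> hosts) {
    int max = 0;
    ArrayList<String> top = new ArrayList<String>();
    for (Map.Entry<String, Integer> entry : hosts.entrySet()) {
        if (entry.getValue() > max) {
            max = entry.getValue();
            top.clear();
            top.add(entry.getKey());
        } else if (entry.getValue() == max) {
            top.add(entry.getKey());
        }
    }
    return top.toArray(new String[0]);
}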

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License:Apache License

protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    if (stat instanceof LocatedFileStatus) {
        return ((LocatedFileStatus) stat).getBlockLocations();
    }
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}
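
The LocatedFileStatus check avoids a second NameNode round trip when the listing already carries block locations. Such listings come from listLocatedStatus; a brief sketch (dir is a placeholder Path):

// Sketch: list files together with their block locations in a single pass.
RemoteIterator<LocatedFileStatus> it = fs.listLocatedStatus(dir);
while (it.hasNext()) {
    LocatedFileStatus status = it.next();
    BlockLocation[] blocks = status.getBlockLocations(); // no extra getFileBlockLocations call needed
}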

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    return splits;
}
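
computeSplitSize and SPLIT_SLOP come from FileInputFormat. In the stock implementation the split size is the block size clamped between the configured minimum and maximum, and SPLIT_SLOP (1.1) lets the final split grow up to 10% beyond splitSize instead of producing a tiny trailing split. A simplified sketch:

// Simplified sketch of FileInputFormat.computeSplitSize: clamp the block size
// between the configured minimum and maximum split sizes.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}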

From source file:com.linkedin.cubert.io.rubix.RubixInputSplit.java

License:Open Source License

@Override
public String[] getLocations() throws IOException, InterruptedException {
    if (hostnames == null) {
        /* Obtain the FileSystem object and get the FileStatus objects for the split */
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(filename);
        /*
         * Obtain the Block locations for the split. This also provides the offset and
         * length information for each block
         */
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, offset, length);
        /**
         * Collect all hosts in a map and populate the number of bytes to be read from
         * each host
         */
        Long l;
        Map<String, Long> hostMap = new HashMap<String, Long>();
        for (BlockLocation bl : blockLocations) {
            final long start = bl.getOffset() < offset ? offset : bl.getOffset();
            final long end = (offset + length) < (bl.getOffset() + bl.getLength()) ? offset + length
                    : bl.getOffset() + bl.getLength();
            final long nRelevantBytes = end - start;
            for (String host : bl.getHosts()) {
                hostMap.put(host, ((l = hostMap.get(host)) == null ? 0 : l) + nRelevantBytes);
            }
        }
        /* Sort them in decreasing order of maximum number of relevant bytes */
        final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
        final Map.Entry<String, Long>[] hostLengthPairs = entries.toArray(new Map.Entry[entries.size()]);

        Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
                return (int) (e2.getValue() - e1.getValue());
            }
        });

        /* Populate the hostnames object */
        final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
        hostnames = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            hostnames[i] = hostLengthPairs[i].getKey();
        }
    }
    return hostnames;
}
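
One caveat with the comparator above: subtracting two long byte counts and casting to int can overflow for very large values. A safer ordering (a sketch, not the original source) uses Long.compare:

// Sketch: overflow-safe descending sort of the per-host byte counts.
Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
    @Override
    public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
        return Long.compare(e2.getValue(), e1.getValue());
    }
});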

From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);

        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for dir according to FSDirectory.java in 0.20
            // however, w/ Hadoop2, dir in local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || fs instanceof DistributedFileSystem == false) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString());
            }

            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }

    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass of the splits generated to extract files from
    // directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}

From source file:com.marklogic.mapreduce.ForestInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus children[] = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null,
                timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found.  The forest is either live or isn't "
                    + "dismounted cleanly.  Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file.  Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = " + position
                            + ", offset = " + offset + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position + ", docid = "
                            + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position + ", offset = "
                            + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen + " last docid="
                                + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, prevOffset - splitStart,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + (prevOffset - splitStart)
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }

            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }

    return splits;
}