Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileBlockLocations drawn from open source projects.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Returns an array containing the hostnames, offsets, and sizes of the portions of the given file that fall within the requested byte range. FileSystem also provides an overload that takes a FileStatus instead of a Path; most of the examples below use that overload.
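
Before the project examples, here is a minimal, self-contained sketch of the call itself. The path is a placeholder and the snippet assumes a reachable Hadoop file system; it prints the hosts serving each block of a file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. an HDFS file path passed on the command line
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);
        // Ask for the block locations covering the whole file.
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : blocks) {
            System.out.println("offset=" + block.getOffset()
                    + " length=" + block.getLength()
                    + " hosts=" + String.join(",", block.getHosts()));
        }
    }
}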

Usage

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}
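
The getBlockIndex call above is the standard FileInputFormat helper that maps a byte offset to the block containing it. A simplified sketch of such a lookup (not the exact Hadoop source):

// Simplified sketch: return the index of the block whose byte range contains the offset.
protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
    for (int i = 0; i < blkLocations.length; i++) {
        long blockStart = blkLocations[i].getOffset();
        long blockEnd = blockStart + blkLocations[i].getLength();
        if (blockStart <= offset && offset < blockEnd) {
            return i;
        }
    }
    throw new IllegalArgumentException("Offset " + offset + " is outside the file");
}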

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HoplogUtil.java

License:Apache License

/**
 * Creates a mapping of hoplog to hdfs blocks on disk
 *
 * @param files
 *          list of hoplog file status objects
 * @return array of hdfs block location objects associated with a hoplog
 * @throws IOException
 */
public static Map<FileStatus, BlockLocation[]> getBlocks(Configuration config, Collection<FileStatus> files)
        throws IOException {
    Map<FileStatus, BlockLocation[]> blocks = new HashMap<FileStatus, BlockLocation[]>();
    if (files == null || files.isEmpty()) {
        return blocks;
    }

    FileSystem fs = files.iterator().next().getPath().getFileSystem(config);

    for (FileStatus hoplog : files) {
        long length = hoplog.getLen();
        BlockLocation[] fileBlocks = fs.getFileBlockLocations(hoplog, 0, length);
        blocks.put(hoplog, fileBlocks);
    }

    return blocks;
}
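
A caller could use the returned map, for example, to tally how many bytes each datanode hosts across all hoplogs. The helper below is only an illustrative sketch, not part of HoplogUtil:

// Illustrative sketch (not part of HoplogUtil): sum, per host, the bytes
// covered by the block locations returned by getBlocks().
public static Map<String, Long> bytesPerHost(Map<FileStatus, BlockLocation[]> blocks) throws IOException {
    Map<String, Long> result = new HashMap<String, Long>();
    for (BlockLocation[] locations : blocks.values()) {
        for (BlockLocation location : locations) {
            for (String host : location.getHosts()) {
                Long current = result.get(host);
                result.put(host, (current == null ? 0L : current) + location.getLength());
            }
        }
    }
    return result;
}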

From source file:com.google.mr4c.sources.URIDataFileSource.java

License:Open Source License

@Override
public BlockLocation[] getBlockLocation() throws IOException {
    URI uri = ContentFactories.scrubURI(m_uri);
    FileSystem fs = FileSystem.get(uri, s_config);
    Path path = new Path(uri);
    FileStatus status = fs.getFileStatus(path);
    return fs.getFileBlockLocations(status, 0, status.getBlockSize());
}
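
Note that this example passes status.getBlockSize() as the length, so it only asks for the locations covering roughly the first block of the file. A variant that covers the whole file (a sketch, not part of the original class) would pass the file length instead:

// Sketch: request block locations for the entire file rather than one block's worth.
FileStatus status = fs.getFileStatus(path);
return fs.getFileBlockLocations(status, 0, status.getLen());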

From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java

License:Apache License

protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java

License:Open Source License

/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first iteration
        //as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }

        /*
        int lFrom  = t.getIterations().get(0).getIntValue();
        int lTo    = t.getIterations().get(1).getIntValue();
        int lIncr  = t.getIterations().get(2).getIntValue();            
        for( int i=lFrom; i<=lTo; i+=lIncr )
        {
           String fname = _fname+"/"+String.valueOf( ((i-_offset)/_blen+_offset) );
           FileSystem fs = FileSystem.get(job);
           FileStatus status = fs.getFileStatus(new Path(fname)); 
           BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
           for( BlockLocation bl : tmp1 )
              countHosts(hosts, bl.getHosts());
        }*/
    }

    //System.out.println("Get locations "+time.stop()+"");

    //majority consensus on top host
    return getTopHosts(hosts);
}
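
The countHosts and getTopHosts helpers are not shown in this snippet. Hypothetical sketches of what such helpers might look like (illustrative only, not the original source):

// Hypothetical sketch: tally how many blocks report each host.
private static void countHosts(HashMap<String, Integer> hosts, String[] blockHosts) {
    for (String host : blockHosts) {
        Integer count = hosts.get(host);
        hosts.put(host, count == null ? 1 : count + 1);
    }
}

// Hypothetical sketch: return the host(s) named by the most blocks.
private static String[] getTopHosts(HashMap<String, Integer> hosts) {
    int max = 0;
    ArrayList<String> top = new ArrayList<String>();
    for (Map.Entry<String, Integer> entry : hosts.entrySet()) {
        if (entry.getValue() > max) {
            max = entry.getValue();
            top.clear();
            top.add(entry.getKey());
        } else if (entry.getValue() == max) {
            top.add(entry.getKey());
        }
    }
    return top.toArray(new String[0]);
}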

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License:Apache License

protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
    if (stat instanceof LocatedFileStatus) {
        return ((LocatedFileStatus) stat).getBlockLocations();
    }
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
}
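
The LocatedFileStatus check avoids a second NameNode round trip when the listing already carries block locations. Such listings come from listLocatedStatus; a brief sketch (dir is a placeholder Path):

// Sketch: list files together with their block locations in a single pass.
RemoteIterator<LocatedFileStatus> it = fs.listLocatedStatus(dir);
while (it.hasNext()) {
    LocatedFileStatus status = it.next();
    BlockLocation[] blocks = status.getBlockLocations(); // no extra getFileBlockLocations call needed
}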

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    return splits;
}
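
computeSplitSize and SPLIT_SLOP come from FileInputFormat. In the stock implementation the split size is the block size clamped between the configured minimum and maximum, and SPLIT_SLOP (1.1) lets the final split grow up to 10% beyond splitSize instead of producing a tiny trailing split. A simplified sketch:

// Simplified sketch of FileInputFormat.computeSplitSize: clamp the block size
// between the configured minimum and maximum split sizes.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}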

From source file:com.linkedin.cubert.io.rubix.RubixInputSplit.java

License:Open Source License

@Override
public String[] getLocations() throws IOException, InterruptedException {
    if (hostnames == null) {
        /* Obtain the FileSystem object and get the FileStatus objects for the split */
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(filename);
        /*
         * Obtain the Block locations for the split. This also provides the offset and
         * length information for each block
         */
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, offset, length);
        /**
         * Collect all hosts in a map and populate the number of bytes to be read from
         * each host
         */
        Long l;
        Map<String, Long> hostMap = new HashMap<String, Long>();
        for (BlockLocation bl : blockLocations) {
            final long start = bl.getOffset() < offset ? offset : bl.getOffset();
            final long end = (offset + length) < (bl.getOffset() + bl.getLength()) ? offset + length
                    : bl.getOffset() + bl.getLength();
            final long nRelevantBytes = end - start;
            for (String host : bl.getHosts()) {
                hostMap.put(host, ((l = hostMap.get(host)) == null ? 0 : l) + nRelevantBytes);
            }
        }
        /* Sort them in decreasing order of maximum number of relevant bytes */
        final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
        final Map.Entry<String, Long>[] hostLengthPairs = entries.toArray(new Map.Entry[entries.size()]);

        Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
                return (int) (e2.getValue() - e1.getValue());
            }
        });

        /* Populate the hostnames object */
        final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
        hostnames = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            hostnames[i] = hostLengthPairs[i].getKey();
        }
    }
    return hostnames;
}
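
One caveat with the comparator above: subtracting two long byte counts and casting to int can overflow for very large values. A safer ordering (a sketch, not the original source) uses Long.compare:

// Sketch: overflow-safe descending sort of the per-host byte counts.
Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
    @Override
    public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
        return Long.compare(e2.getValue(), e1.getValue());
    }
});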

From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);

        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for dir according to FSDirectory.java in 0.20
            // however, w/ Hadoop2, dir in local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || fs instanceof DistributedFileSystem == false) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString());
            }

            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }

    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass of the splits generated to extract files from
    // directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}

From source file:com.marklogic.mapreduce.ForestInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus children[] = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null,
                timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found.  The forest is either live or isn't "
                    + "dismounted cleanly.  Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file.  Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = " + position
                            + ", offset = " + offset + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position + ", docid = "
                            + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position + ", offset = "
                            + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen + " last docid="
                                + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, prevOffset - splitStart,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + (prevOffset - splitStart)
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }

            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }

    return splits;
}