List of usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException

FileSystem also provides an overload that takes a FileStatus, public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len), which reuses already-fetched metadata instead of resolving the path again; most of the examples below use that variant.
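Before the project-specific examples, here is a minimal, self-contained sketch of the call itself. The class name BlockLocationsDemo is ours (not from any project below), and it assumes a Hadoop client Configuration that can reach the target filesystem:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the file once; the FileStatus overload reuses this metadata.
        FileStatus status = fs.getFileStatus(path);

        // Ask for the locations of every block in the file: range [0, len).
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());

        // Each BlockLocation describes one block: its byte range within the
        // file and the hosts holding a replica of it.
        for (BlockLocation block : blocks) {
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    block.getOffset(), block.getLength(),
                    String.join(",", block.getHosts()));
        }
    }
}

Passing 0 and status.getLen() requests the locations of every block in the file, which is the pattern nearly all of the examples below follow.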
From source file: eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License: Apache License

/**
 * Gets block locations of input files sorted by the total number of occurrences.
 *
 * @param fs Hadoop filesystem handle
 * @param inFiles array of input files
 * @return sorted String array
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> hostMap = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus s = fs.getFileStatus(inFile);
        BlockLocation[] locations = fs.getFileBlockLocations(s, 0, s.getLen());
        for (BlockLocation location : locations) {
            String[] hosts = location.getHosts();
            for (String host : hosts) {
                if (!hostMap.containsKey(host)) {
                    hostMap.put(host, 1);
                    continue;
                }
                hostMap.put(host, hostMap.get(host) + 1);
            }
        }
    }

    // sort hosts by number of references to blocks of input files
    List<String> hosts = new ArrayList<String>();
    hosts.addAll(hostMap.keySet());
    Collections.sort(hosts, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return hostMap.get(host2) - hostMap.get(host1);
        }
    });
    return hosts.toArray(new String[0]);
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    // Cap mapred.reduce.tasks at a count parsed from the fair-scheduler
    // pool name (the text after the first 'l', e.g. "pool8" yields 8).
    String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: hsyndicate.tools.BlockLocations.java
License: Apache License

public static void main(String[] args) throws Exception {
    Path p = new Path(args[0]);
    Configuration conf = new Configuration();
    FileSystem fs = p.getFileSystem(conf);
    FileStatus f = fs.getFileStatus(p);
    BlockLocation[] bla = fs.getFileBlockLocations(f, 0, f.getLen());

    System.out.println("File : " + f.getPath().toString());
    for (BlockLocation bl : bla) {
        System.out.println("Offset : " + bl.getOffset());
        System.out.println("Len : " + bl.getLength());
        System.out.println("Hosts : " + makeCommaSeparated(bl.getHosts()));
        System.out.println("Names : " + makeCommaSeparated(bl.getNames()));
        System.out.println("TopologyPaths : " + makeCommaSeparated(bl.getTopologyPaths()));
    }
}
From source file: IndexService.IColumnInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    FileSystem fs = FileSystem.get(job);
    List<IColumnInputSplit> splits = new ArrayList<IColumnInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    String[] inputfiles = job.getStrings("mapred.input.dir");

    // For each input file, keep only the largest matching "_idx*" index file.
    for (String file : inputfiles) {
        FileStatus[] fss = fs.globStatus(new Path(file + "_idx*"));
        FileStatus status = null;
        long length = 0;
        for (FileStatus ss : fss) {
            if (ss.getLen() > length) {
                length = ss.getLen();
                status = ss;
            }
        }
        files.put(file, status);
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        long length = file.getLen();
        tmpPath = keypath;

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IColumnInputSplit split = new IColumnInputSplit(keypath, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IColumnInputSplit split = new IColumnInputSplit(keypath, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new IColumnInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IColumnInputSplit[splits.size()]);
}
From source file: IndexService.IFormatInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IFormatInputSplit> splits = new ArrayList<IFormatInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IFormatInputSplit split = new IFormatInputSplit(path, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IFormatInputSplit split = new IFormatInputSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
        tmpPath = path;
    }

    if (splits.size() == 0) {
        splits.add(new IFormatInputSplit(tmpPath, 0, 0, 0, new String[0]));
        return splits.toArray(new IFormatInputSplit[splits.size()]);
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new IFormatInputSplit[splits.size()]);
}
From source file: IndexService.IndexMergeIFormatInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IndexMergeIFormatSplit> splits = new ArrayList<IndexMergeIFormatSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IndexMergeIFormatSplit split = new IndexMergeIFormatSplit(path, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IndexMergeIFormatSplit split = new IndexMergeIFormatSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
        tmpPath = path;
    }

    if (splits.size() == 0) {
        splits.add(new IndexMergeIFormatSplit(tmpPath, 0, null, 0, null));
        return splits.toArray(new IndexMergeIFormatSplit[splits.size()]);
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IndexMergeIFormatSplit[splits.size()]);
}
From source file: input_format.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    //String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    //int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    int max = 1000;
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: io.hops.erasure_coding.BaseEncodingManager.java
License: Apache License

/**
 * RAID an individual file
 */
public static boolean doFileRaid(Configuration conf, Path sourceFile, Path parityPath, Codec codec,
        Statistics statistics, Progressable reporter, int targetRepl, int metaRepl) throws IOException {
    FileSystem srcFs = sourceFile.getFileSystem(conf);
    FileStatus sourceStatus = srcFs.getFileStatus(sourceFile);

    // extract block locations from the file system
    BlockLocation[] locations = srcFs.getFileBlockLocations(sourceFile, 0, sourceStatus.getLen());

    // if the file has 2 or fewer blocks, there is nothing to do
    if (locations.length <= 2) {
        return false;
    }

    // add up the raw disk space occupied by this file
    long diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * sourceStatus.getReplication());
    }
    statistics.numProcessedBlocks += locations.length;
    statistics.processedSize += diskSpace;

    // generate parity file
    generateParityFile(conf, sourceStatus, targetRepl, reporter, srcFs, parityPath, codec, locations.length,
            sourceStatus.getReplication(), metaRepl, sourceStatus.getBlockSize());

    if (!srcFs.setReplication(sourceFile, (short) targetRepl)) {
        LOG.info("Error in reducing replication of " + sourceFile + " to " + targetRepl);
        statistics.remainingSize += diskSpace;
        return false;
    }

    diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * targetRepl);
    }
    statistics.remainingSize += diskSpace;

    // the metafile will have this many blocks
    int numMeta = locations.length / codec.stripeLength;
    if (locations.length % codec.stripeLength != 0) {
        numMeta++;
    }

    // we create numMeta blocks for every file, each with metaRepl replicas.
    // The last block of the metafile might not be completely filled up, but
    // we ignore that for now.
    statistics.numMetaBlocks += (numMeta * metaRepl);
    statistics.metaSize += (numMeta * metaRepl * sourceStatus.getBlockSize());
    return true;
}
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // It generates the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path filePath = file.getPath();

        // Calculates the content (array of double) length in bytes.
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());
        FSDataInputStream dis = fs.open(filePath);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        long length = _matdata.dataNumOfItems * MLDataType.miDOUBLE.bytes; // Content length.
        long lContentByteOffset = dis.getPos();
        _matReader.close();
        _matReader = null;
        dis = null;

        // Zero bytes, empty file split.
        if (length <= 0) {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        }

        // Split the data.
        if (length > 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }

            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    }

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file: kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexSplit.java
License: Open Source License

private String[] findBestLocations(Path[] indexFilePaths, Configuration conf) throws IOException {
    Hashtable<String, MutableInteger> blkLocations = new Hashtable<String, MutableInteger>();
    for (Path path : indexFilePaths) {
        FileSystem fs = path.getFileSystem(conf);
        FileStatus fileStatus = fs.getFileStatus(path);
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for (BlockLocation location : fileBlockLocations) {
            for (String host : location.getHosts()) {
                MutableInteger cnt = blkLocations.get(host);
                if (cnt == null) {
                    blkLocations.put(host, new MutableInteger(1));
                } else {
                    cnt.increase();
                }
            }
        }
    }

    List<String> blkLocationsArr = new ArrayList<String>();
    for (String key : blkLocations.keySet()) {
        blkLocationsArr.add(key);
    }

    if (blkLocationsArr.size() == 0) {
        return new String[] { "localhost" };
    } else {
        return blkLocationsArr.toArray(new String[0]);
    }
}