Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Source Link

Document

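Returns an array containing the hostnames, offset, and size of the portions of the given file. Note that the prototype above is the Path overload; most of the examples below call the overload that takes a FileStatus as its first argument.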

Usage

From source file:eu.scape_project.pt.mapred.input.ControlFileInputFormat.java

License:Apache License

/**
 * Collects the hosts named in the block locations of the given input files
 * and orders them by how often each host is listed, most frequent first.
 *
 * @param fs Hadoop filesystem handle used to look up file status and block locations
 * @param inFiles array of input files
 * @return host names sorted by descending number of block references
 * @throws IOException if a file status or its block locations cannot be retrieved
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> blockCounts = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus status = fs.getFileStatus(inFile);
        for (BlockLocation block : fs.getFileBlockLocations(status, 0, status.getLen())) {
            for (String host : block.getHosts()) {
                // Single lookup: a missing entry means this is the first reference.
                Integer seen = blockCounts.get(host);
                blockCounts.put(host, seen == null ? 1 : seen + 1);
            }
        }
    }

    // Rank hosts by the number of input-file blocks that reference them (descending).
    List<String> ranked = new ArrayList<String>(blockCounts.keySet());
    Collections.sort(ranked, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return blockCounts.get(host2).compareTo(blockCounts.get(host1));
        }
    });
    return ranked.toArray(new String[0]);
}

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * <p>Once the splits are built, "mapred.reduce.tasks" is set to the number of
 * splits, capped at the integer parsed from the text that follows the first
 * "l" in the "mapred.fairscheduler.pool" setting.
 *
 * <p>NOTE(review): the pool setting is dereferenced and parsed without any
 * check - confirm it is always present and ends in a number.
 *
 * @param job the job context supplying the configuration and input paths
 * @return the splits for all input files
 * @throws IOException if file or block information cannot be read
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (length == 0) {
            // Zero-length files get a split with an empty hosts array.
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
            continue;
        }
        if (!isSplitable(job, path)) {
            // Unsplittable file: one split covering everything, placed on the first block's hosts.
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
            continue;
        }

        long splitSize = computeSplitSize(file.getBlockSize(), minSize, maxSize);

        // Emit full-size splits while the remainder exceeds SPLIT_SLOP split sizes.
        long offset = 0;
        while (((double) (length - offset)) / splitSize > SPLIT_SLOP) {
            int blkIndex = getBlockIndex(blkLocations, offset);
            splits.add(new MyFileSplit(path, offset, splitSize, blkLocations[blkIndex].getHosts()));
            offset += splitSize;
        }

        // Whatever is left becomes the final split, placed on the last block's hosts.
        if (offset != length) {
            splits.add(new MyFileSplit(path, offset, length - offset,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    String pool = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(pool.substring(pool.indexOf("l") + 1));

    // One reducer per split, but never more than the pool-derived maximum.
    job.getConfiguration().setInt("mapred.reduce.tasks", Math.min(splits.size(), max));
    return splits;
}

From source file:hsyndicate.tools.BlockLocations.java

License:Apache License

/**
 * Prints the block locations (offset, length, hosts, names and topology
 * paths) of the file given as the first command-line argument.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
 * @throws Exception if the file system cannot be reached or the file cannot be examined
 */
public static void main(String[] args) throws Exception {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 1) {
        System.err.println("Usage: BlockLocations <path>");
        System.exit(1);
    }

    Path p = new Path(args[0]);
    Configuration conf = new Configuration();
    FileSystem fs = p.getFileSystem(conf);
    FileStatus f = fs.getFileStatus(p);
    BlockLocation[] bla = fs.getFileBlockLocations(f, 0, f.getLen());

    System.out.println("File : " + f.getPath().toString());
    for (BlockLocation bl : bla) {
        System.out.println("Offset : " + bl.getOffset());
        System.out.println("Len : " + bl.getLength());
        System.out.println("Hosts : " + makeCommaSeparated(bl.getHosts()));
        System.out.println("Names : " + makeCommaSeparated(bl.getNames()));
        System.out.println("TopologyPaths : " + makeCommaSeparated(bl.getTopologyPaths()));
    }
}

From source file:IndexService.IColumnInputFormat.java

License:Open Source License

/**
 * Builds the input splits from the index files of the configured inputs.
 *
 * <p>For each entry of "mapred.input.dir" the largest file matching
 * "&lt;entry&gt;_idx*" is selected. An index file with at most one block yields a
 * single split; otherwise one split is created per segment of the file's
 * segment index. If no split is produced at all, a single empty split is
 * returned.
 *
 * @param job job configuration, also used to obtain the file system
 * @param numSplits not used by this implementation
 * @return the generated splits; always contains at least one element
 * @throws IOException if an input has no non-empty index file, or on file system errors
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    FileSystem fs = FileSystem.get(job);
    List<IColumnInputSplit> splits = new ArrayList<IColumnInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    String[] inputfiles = job.getStrings("mapred.input.dir");

    for (String file : inputfiles) {
        FileStatus[] fss = fs.globStatus(new Path(file + "_idx*"));
        FileStatus status = null;
        long length = 0;
        if (fss != null) {
            // Keep the largest matching index file.
            for (FileStatus ss : fss) {
                if (ss.getLen() > length) {
                    length = ss.getLen();
                    status = ss;
                }
            }
        }
        if (status == null) {
            // Previously a null status was stored and dereferenced later (NPE).
            throw new IOException("No non-empty index file found for input: " + file);
        }
        files.put(file, status);
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        long length = file.getLen();

        tmpPath = keypath;

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (blkLocations.length <= 1) {
            // Guard against an empty location array instead of indexing [0] blindly.
            String[] hosts = blkLocations.length == 0 ? new String[0] : blkLocations[0].getHosts();
            splits.add(new IColumnInputSplit(keypath, length, hosts));
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            try {
                ISegmentIndex segmentIndex = ifd.segIndex();

                // NOTE(review): assumes segment i corresponds to block i, i.e. the number of
                // segments never exceeds blkLocations.length - confirm against the file format.
                for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                    IColumnInputSplit split = new IColumnInputSplit(keypath, segmentIndex.getseglen(i),
                            segmentIndex.getILineIndex(i).beginline(),
                            segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                            blkLocations[i].getHosts());
                    splits.add(split);
                }
            } finally {
                // Release the data file even when reading the segment index fails.
                ifd.close();
            }
        }
    }

    if (splits.size() == 0) {
        splits.add(new IColumnInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }

    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IColumnInputSplit[splits.size()]);
}

From source file:IndexService.IFormatInputFormat.java

License:Open Source License

/**
 * Builds the input splits for all input files of the job.
 *
 * <p>A file with at most one block yields a single split covering the whole
 * file; otherwise one split is created per segment of the file's segment
 * index. If no split is produced at all, a single empty split is returned.
 *
 * @param job job configuration, also used to resolve each file's file system
 * @param numSplits not used by this implementation
 * @return the generated splits; always contains at least one element
 * @throws IOException on file system errors
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IFormatInputSplit> splits = new ArrayList<IFormatInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (blkLocations.length <= 1) {
            // Zero-length files have no block locations; use an empty hosts array
            // instead of failing on blkLocations[0].
            String[] hosts = blkLocations.length == 0 ? new String[0] : blkLocations[0].getHosts();
            splits.add(new IFormatInputSplit(path, length, hosts));
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            try {
                ISegmentIndex segmentIndex = ifd.segIndex();
                // NOTE(review): assumes segment i corresponds to block i, i.e. the number of
                // segments never exceeds blkLocations.length - confirm against the file format.
                for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                    IFormatInputSplit split = new IFormatInputSplit(path, segmentIndex.getseglen(i),
                            segmentIndex.getILineIndex(i).beginline(),
                            segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                            blkLocations[i].getHosts());
                    splits.add(split);
                }
            } finally {
                // Release the data file even when reading the segment index fails.
                ifd.close();
            }
        }
        tmpPath = path;
    }

    if (splits.size() == 0) {
        // Nothing to read: hand back one empty split (the total is not logged in this case).
        splits.add(new IFormatInputSplit(tmpPath, 0, 0, 0, new String[0]));
    } else {
        LOG.info("Total # of splits: " + splits.size());
    }
    return splits.toArray(new IFormatInputSplit[splits.size()]);
}

From source file:IndexService.IndexMergeIFormatInputFormat.java

License:Open Source License

/**
 * Builds the input splits for all input files of the job.
 *
 * <p>A file with at most one block yields a single split covering the whole
 * file; otherwise one split is created per segment of the file's segment
 * index. If no split is produced at all, a single placeholder split is
 * returned.
 *
 * @param job job configuration, also used to resolve each file's file system
 * @param numSplits not used by this implementation
 * @return the generated splits; always contains at least one element
 * @throws IOException on file system errors
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IndexMergeIFormatSplit> splits = new ArrayList<IndexMergeIFormatSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (blkLocations.length <= 1) {
            // Zero-length files have no block locations; use an empty hosts array
            // instead of failing on blkLocations[0].
            String[] hosts = blkLocations.length == 0 ? new String[0] : blkLocations[0].getHosts();
            splits.add(new IndexMergeIFormatSplit(path, length, hosts));
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            try {
                ISegmentIndex segmentIndex = ifd.segIndex();

                // NOTE(review): assumes segment i corresponds to block i, i.e. the number of
                // segments never exceeds blkLocations.length - confirm against the file format.
                for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                    IndexMergeIFormatSplit split = new IndexMergeIFormatSplit(path, segmentIndex.getseglen(i),
                            segmentIndex.getILineIndex(i).beginline(),
                            segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                            blkLocations[i].getHosts());
                    splits.add(split);
                }
            } finally {
                // Release the data file even when reading the segment index fails.
                ifd.close();
            }
        }

        tmpPath = path;
    }

    if (splits.size() == 0) {
        // Nothing to read: hand back one placeholder split (the total is not printed in this case).
        splits.add(new IndexMergeIFormatSplit(tmpPath, 0, null, 0, null));
    } else {
        System.out.println("Total # of splits: " + splits.size());
    }
    return splits.toArray(new IndexMergeIFormatSplit[splits.size()]);
}

From source file:input_format.MyFileInputFormat.java

License:Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * <p>Once the splits are built, "mapred.reduce.tasks" is set to the number of
 * splits, capped at 1000.
 *
 * @param job the job context supplying the configuration and input paths
 * @return the splits for all input files
 * @throws IOException if file or block information cannot be read
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (length == 0) {
            // Zero-length files get a split with an empty hosts array.
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
            continue;
        }
        if (!isSplitable(job, path)) {
            // Unsplittable file: one split covering everything, placed on the first block's hosts.
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
            continue;
        }

        long splitSize = computeSplitSize(file.getBlockSize(), minSize, maxSize);

        // Emit full-size splits while the remainder exceeds SPLIT_SLOP split sizes.
        long offset = 0;
        while (((double) (length - offset)) / splitSize > SPLIT_SLOP) {
            int blkIndex = getBlockIndex(blkLocations, offset);
            splits.add(new MyFileSplit(path, offset, splitSize, blkLocations[blkIndex].getHosts()));
            offset += splitSize;
        }

        // Whatever is left becomes the final split, placed on the last block's hosts.
        if (offset != length) {
            splits.add(new MyFileSplit(path, offset, length - offset,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    // Hard-coded reducer cap; an earlier revision derived it from the
    // "mapred.fairscheduler.pool" setting instead.
    int max = 1000;

    // One reducer per split, but never more than the cap.
    job.getConfiguration().setInt("mapred.reduce.tasks", Math.min(splits.size(), max));
    return splits;
}

From source file:io.hops.erasure_coding.BaseEncodingManager.java

License:Apache License

/**
 * RAID an individual file/* ww  w. j  ava  2  s.c  o  m*/
 */
public static boolean doFileRaid(Configuration conf, Path sourceFile, Path parityPath, Codec codec,
        Statistics statistics, Progressable reporter, int targetRepl, int metaRepl) throws IOException {
    FileSystem srcFs = sourceFile.getFileSystem(conf);
    FileStatus sourceStatus = srcFs.getFileStatus(sourceFile);

    // extract block locations from File system
    BlockLocation[] locations = srcFs.getFileBlockLocations(sourceFile, 0, sourceStatus.getLen());
    // if the file has fewer than 2 blocks, then nothing to do
    if (locations.length <= 2) {
        return false;
    }

    // add up the raw disk space occupied by this file
    long diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * sourceStatus.getReplication());
    }
    statistics.numProcessedBlocks += locations.length;
    statistics.processedSize += diskSpace;

    // generate parity file
    generateParityFile(conf, sourceStatus, targetRepl, reporter, srcFs, parityPath, codec, locations.length,
            sourceStatus.getReplication(), metaRepl, sourceStatus.getBlockSize());
    if (srcFs.setReplication(sourceFile, (short) targetRepl) == false) {
        LOG.info("Error in reducing replication of " + sourceFile + " to " + targetRepl);
        statistics.remainingSize += diskSpace;
        return false;
    }
    ;

    diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * targetRepl);
    }
    statistics.remainingSize += diskSpace;

    // the metafile will have this many number of blocks
    int numMeta = locations.length / codec.stripeLength;
    if (locations.length % codec.stripeLength != 0) {
        numMeta++;
    }

    // we create numMeta for every file. This metablock has metaRepl # replicas.
    // the last block of the metafile might not be completely filled up, but we
    // ignore that for now.
    statistics.numMetaBlocks += (numMeta * metaRepl);
    statistics.metaSize += (numMeta * metaRepl * sourceStatus.getBlockSize());
    return true;
}

From source file:it.prz.jmatrw4spark.JMATFileInputFormat.java

License:Open Source License

/**
 * Generates the splits over the data section of each input file.
 *
 * <p>Every file is opened once to read its header: the content length is the
 * number of data items times the size of a double, and the content starts at
 * the stream position reached after the header has been read. Splits are then
 * computed over that content range only.
 *
 * @param job the job context supplying the configuration and input paths
 * @return the splits for all input files
 * @throws IOException if a file cannot be opened or read, or block information is unavailable
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    //It generates the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    for (FileStatus file : files) {
        Path filePath = file.getPath();
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());

        //Calculates the content (array of double) length in bytes.
        long length; //Content length.
        long lContentByteOffset;
        FSDataInputStream dis = fs.open(filePath);
        try {
            JMATReader matReader = new JMATReader(dis);
            try {
                JMATInfo matData = matReader.getInfo();
                length = matData.dataNumOfItems * MLDataType.miDOUBLE.bytes;
                lContentByteOffset = dis.getPos();
            } finally {
                matReader.close();
            }
        } finally {
            // Close the stream even when the reader could not be created or read;
            // closing an already-closed stream is harmless.
            dis.close();
        }

        if (length <= 0) {
            //Zero bytes, empty file split: create empty hosts array for zero length files.
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        } else {
            //Split the data.
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }

            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                // Emit full-size splits while the remainder exceeds SPLIT_SLOP split sizes.
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;

                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(
                            makeSplit(filePath, lBlockByteStart, splitSize, blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                } //EndWhile.

                // Whatever is left becomes the final split.
                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    } //EndFor.

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexSplit.java

License:Open Source License

/**
 * Collects the hosts that appear in the block locations of the given index files.
 *
 * <p>A per-host occurrence count is maintained while scanning, but the
 * returned array is in the table's key order, not sorted by that count.
 *
 * @param indexFilePaths index files whose block locations are inspected
 * @param conf configuration used to resolve each path's file system
 * @return the distinct host names, or {@code { "localhost" }} when none were found
 * @throws IOException if a file status or its block locations cannot be retrieved
 */
private String[] findBestLocations(Path[] indexFilePaths, Configuration conf) throws IOException {
    Hashtable<String, MutableInteger> hostCounts = new Hashtable<String, MutableInteger>();
    for (Path indexFile : indexFilePaths) {
        FileSystem fs = indexFile.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(indexFile);
        for (BlockLocation block : fs.getFileBlockLocations(status, 0, status.getLen())) {
            for (String host : block.getHosts()) {
                MutableInteger count = hostCounts.get(host);
                if (count != null) {
                    count.increase();
                } else {
                    hostCounts.put(host, new MutableInteger(1));
                }
            }
        }
    }

    // Fall back to the local host when no block location named any host.
    if (hostCounts.isEmpty()) {
        return new String[] { "localhost" };
    }
    return hostCounts.keySet().toArray(new String[0]);
}