Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of the org.apache.hadoop.fs.FileSystem method getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing the hostnames, offsets, and sizes of portions of the given file.
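
The following minimal sketch (assuming a default Configuration and a hypothetical file path /tmp/example.txt; substitute any existing file) shows a typical call: request the block locations covering the whole file, then print the offset, length, and hosts for each block.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder path; point this at any file that exists in the file system.
        Path path = new Path("/tmp/example.txt");
        FileStatus status = fs.getFileStatus(path);

        // Ask for the block locations spanning the entire file (offset 0, length = file size).
        BlockLocation[] locations = fs.getFileBlockLocations(path, 0, status.getLen());
        for (BlockLocation location : locations) {
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    location.getOffset(), location.getLength(),
                    String.join(",", location.getHosts()));
        }
    }
}

Many of the examples below use the overload that takes a FileStatus instead of a Path; both variants return the same BlockLocation array for the requested byte range.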

Usage

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getFileSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getGuaguaSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                        length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()) }));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false,
                        new FileSplit[] { new FileSplit(path, length - bytesRemaining, bytesRemaining,
                                blkLocations[blkLocations.length - 1].getHosts()) }));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, blkLocations[0].getHosts()) }));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, new String[0]) }));
        }
    }

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            // here double comparison can be directly used because of no precision requirement
            while (((double) bytesRemaining) / splitSize > 1.1d) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

private FileSplit getFileSplit(FileSystem fs, FileStatus file, long offset, long length) throws IOException {
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, offset, length);
    List<String> hosts = new ArrayList<String>();
    for (BlockLocation location : blkLocations) {
        hosts.addAll(Arrays.asList(location.getHosts()));
    }
    String[] shosts = new String[hosts.size()];
    FileSplit fsp = new FileSplit(file.getPath(), offset, length, hosts.toArray(shosts));
    return fsp;
}

From source file:mvm.rya.accumulo.mr.utils.AccumuloHDFSFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    //read the params from AccumuloInputFormat
    Configuration conf = jobContext.getConfiguration();
    Instance instance = AccumuloProps.getInstance(jobContext);
    String user = AccumuloProps.getUsername(jobContext);
    AuthenticationToken password = AccumuloProps.getPassword(jobContext);
    String table = AccumuloProps.getTablename(jobContext);
    ArgumentChecker.notNull(instance);
    ArgumentChecker.notNull(table);

    //find the files necessary
    try {
        AccumuloConfiguration acconf = instance.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Connector connector = instance.getConnector(user, password);
        TableOperations tos = connector.tableOperations();
        String tableId = tos.tableIdMap().get(table);
        String filePrefix = acconf.get(Property.INSTANCE_DFS_DIR) + "/tables/" + tableId;
        System.out.println(filePrefix);

        Scanner scanner = connector.createScanner("!METADATA", Constants.NO_AUTHS); //TODO: auths?
        scanner.setRange(new Range(new Text(tableId + "\u0000"), new Text(tableId + "\uFFFD")));
        scanner.fetchColumnFamily(new Text("file"));
        List<String> files = new ArrayList<String>();
        List<InputSplit> fileSplits = new ArrayList<InputSplit>();
        Job job = new Job(conf);
        for (Map.Entry<Key, Value> entry : scanner) {
            String file = filePrefix + entry.getKey().getColumnQualifier().toString();
            files.add(file);
            Path path = new Path(file);
            FileStatus fileStatus = fs.getFileStatus(path);
            long len = fileStatus.getLen();
            BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, len);
            fileSplits.add(new FileSplit(path, 0, len, fileBlockLocations[0].getHosts()));
            //                FileInputFormat.addInputPath(job, path);
        }
        System.out.println(files);
        return fileSplits;
        //            return super.getSplits(job);
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file:org.apache.accumulo.server.test.performance.scan.CollectTabletStats.java

License:Apache License

private static void reportHdfsBlockLocations(List<String> files) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    System.out.println("\t\tFile block report : ");
    for (String file : files) {
        FileStatus status = fs.getFileStatus(new Path(file));

        if (status.isDir()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }

        BlockLocation[] locs = fs.getFileBlockLocations(status, 0, status.getLen());

        System.out.println("\t\t\tBlocks for : " + file);

        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d  hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }

    System.out.println();

}

From source file:org.apache.accumulo.server.util.LocalityCheck.java

License:Apache License

private void addBlocks(VolumeManager fs, String host, ArrayList<String> files, Map<String, Long> totalBlocks,
        Map<String, Long> localBlocks) throws Exception {
    long allBlocks = 0;
    long matchingBlocks = 0;
    if (!totalBlocks.containsKey(host)) {
        totalBlocks.put(host, 0L);
        localBlocks.put(host, 0L);
    }
    for (String file : files) {
        Path filePath = new Path(file);
        FileSystem ns = fs.getVolumeByPath(filePath).getFileSystem();
        FileStatus fileStatus = ns.getFileStatus(filePath);
        BlockLocation[] fileBlockLocations = ns.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for (BlockLocation blockLocation : fileBlockLocations) {
            allBlocks++;
            for (String location : blockLocation.getHosts()) {
                HostAndPort hap = HostAndPort.fromParts(location, 0);
                if (hap.getHostText().equals(host)) {
                    matchingBlocks++;
                    break;
                }
            }
        }
    }
    totalBlocks.put(host, allBlocks + totalBlocks.get(host));
    localBlocks.put(host, matchingBlocks + localBlocks.get(host));
}

From source file:org.apache.accumulo.test.performance.scan.CollectTabletStats.java

License:Apache License

private static void reportHdfsBlockLocations(List<FileRef> files) throws Exception {
    VolumeManager fs = VolumeManagerImpl.get();

    System.out.println("\t\tFile block report : ");
    for (FileRef file : files) {
        FileStatus status = fs.getFileStatus(file.path());

        if (status.isDirectory()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }
        FileSystem ns = fs.getVolumeByPath(file.path()).getFileSystem();
        BlockLocation[] locs = ns.getFileBlockLocations(status, 0, status.getLen());

        System.out.println("\t\t\tBlocks for : " + file);

        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d  hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }

    System.out.println();

}

From source file:org.apache.asterix.external.util.HDFSUtils.java

License:Apache License

/**
 * Instead of creating the split using the input format, we do it manually
 * This function returns fileSplits (1 per hdfs file block) irrespective of the number of partitions
 * and the produced splits only cover intersection between current files in hdfs and files stored internally
 * in AsterixDB
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 * @return
 * @throws IOException
 */
public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create files splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i).getFileName() == file.getFileName()
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }

            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block interact with delta -> Create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files.clear();
    files.addAll(orderedExternalFiles);
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}