Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of the org.apache.hadoop.fs.FileSystem method getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing the hostnames, offsets, and sizes of portions of the given file.
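
The following minimal sketch (assuming a default Configuration and a hypothetical file path /tmp/example.txt; substitute any existing file) shows a typical call: request the block locations covering the whole file, then print the offset, length, and hosts for each block.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder path; point this at any file that exists in the file system.
        Path path = new Path("/tmp/example.txt");
        FileStatus status = fs.getFileStatus(path);

        // Ask for the block locations spanning the entire file (offset 0, length = file size).
        BlockLocation[] locations = fs.getFileBlockLocations(path, 0, status.getLen());
        for (BlockLocation location : locations) {
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    location.getOffset(), location.getLength(),
                    String.join(",", location.getHosts()));
        }
    }
}

Many of the examples below use the overload that takes a FileStatus instead of a Path; both variants return the same BlockLocation array for the requested byte range.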

Usage

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getFileSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getGuaguaSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                        length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()) }));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false,
                        new FileSplit[] { new FileSplit(path, length - bytesRemaining, bytesRemaining,
                                blkLocations[blkLocations.length - 1].getHosts()) }));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, blkLocations[0].getHosts()) }));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, new String[0]) }));
        }
    }

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            // here double comparison can be directly used because of no precision requirement
            while (((double) bytesRemaining) / splitSize > 1.1d) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

private FileSplit getFileSplit(FileSystem fs, FileStatus file, long offset, long length) throws IOException {
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, offset, length);
    List<String> hosts = new ArrayList<String>();
    for (BlockLocation location : blkLocations) {
        hosts.addAll(Arrays.asList(location.getHosts()));
    }
    String[] shosts = new String[hosts.size()];
    FileSplit fsp = new FileSplit(file.getPath(), offset, length, hosts.toArray(shosts));
    return fsp;
}

From source file:mvm.rya.accumulo.mr.utils.AccumuloHDFSFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    //read the params from AccumuloInputFormat
    Configuration conf = jobContext.getConfiguration();
    Instance instance = AccumuloProps.getInstance(jobContext);
    String user = AccumuloProps.getUsername(jobContext);
    AuthenticationToken password = AccumuloProps.getPassword(jobContext);
    String table = AccumuloProps.getTablename(jobContext);
    ArgumentChecker.notNull(instance);
    ArgumentChecker.notNull(table);

    //find the files necessary
    try {
        AccumuloConfiguration acconf = instance.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Connector connector = instance.getConnector(user, password);
        TableOperations tos = connector.tableOperations();
        String tableId = tos.tableIdMap().get(table);
        String filePrefix = acconf.get(Property.INSTANCE_DFS_DIR) + "/tables/" + tableId;
        System.out.println(filePrefix);

        Scanner scanner = connector.createScanner("!METADATA", Constants.NO_AUTHS); //TODO: auths?
        scanner.setRange(new Range(new Text(tableId + "\u0000"), new Text(tableId + "\uFFFD")));
        scanner.fetchColumnFamily(new Text("file"));
        List<String> files = new ArrayList<String>();
        List<InputSplit> fileSplits = new ArrayList<InputSplit>();
        Job job = new Job(conf);
        for (Map.Entry<Key, Value> entry : scanner) {
            String file = filePrefix + entry.getKey().getColumnQualifier().toString();
            files.add(file);
            Path path = new Path(file);
            FileStatus fileStatus = fs.getFileStatus(path);
            long len = fileStatus.getLen();
            BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, len);
            fileSplits.add(new FileSplit(path, 0, len, fileBlockLocations[0].getHosts()));
            //                FileInputFormat.addInputPath(job, path);
        }
        System.out.println(files);
        return fileSplits;
        //            return super.getSplits(job);
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file:org.apache.accumulo.server.test.performance.scan.CollectTabletStats.java

License:Apache License

private static void reportHdfsBlockLocations(List<String> files) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    System.out.println("\t\tFile block report : ");
    for (String file : files) {
        FileStatus status = fs.getFileStatus(new Path(file));

        if (status.isDir()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }

        BlockLocation[] locs = fs.getFileBlockLocations(status, 0, status.getLen());

        System.out.println("\t\t\tBlocks for : " + file);

        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d  hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }

    System.out.println();

}

From source file:org.apache.accumulo.server.util.LocalityCheck.java

License:Apache License

private void addBlocks(VolumeManager fs, String host, ArrayList<String> files, Map<String, Long> totalBlocks,
        Map<String, Long> localBlocks) throws Exception {
    long allBlocks = 0;
    long matchingBlocks = 0;
    if (!totalBlocks.containsKey(host)) {
        totalBlocks.put(host, 0L);
        localBlocks.put(host, 0L);
    }
    for (String file : files) {
        Path filePath = new Path(file);
        FileSystem ns = fs.getVolumeByPath(filePath).getFileSystem();
        FileStatus fileStatus = ns.getFileStatus(filePath);
        BlockLocation[] fileBlockLocations = ns.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for (BlockLocation blockLocation : fileBlockLocations) {
            allBlocks++;
            for (String location : blockLocation.getHosts()) {
                HostAndPort hap = HostAndPort.fromParts(location, 0);
                if (hap.getHostText().equals(host)) {
                    matchingBlocks++;
                    break;
                }
            }
        }
    }
    totalBlocks.put(host, allBlocks + totalBlocks.get(host));
    localBlocks.put(host, matchingBlocks + localBlocks.get(host));
}

From source file:org.apache.accumulo.test.performance.scan.CollectTabletStats.java

License:Apache License

private static void reportHdfsBlockLocations(List<FileRef> files) throws Exception {
    VolumeManager fs = VolumeManagerImpl.get();

    System.out.println("\t\tFile block report : ");
    for (FileRef file : files) {
        FileStatus status = fs.getFileStatus(file.path());

        if (status.isDirectory()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }
        FileSystem ns = fs.getVolumeByPath(file.path()).getFileSystem();
        BlockLocation[] locs = ns.getFileBlockLocations(status, 0, status.getLen());

        System.out.println("\t\t\tBlocks for : " + file);

        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d  hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }

    System.out.println();

}

From source file:org.apache.asterix.external.util.HDFSUtils.java

License:Apache License

/**
 * Instead of creating the split using the input format, we do it manually
 * This function returns fileSplits (1 per hdfs file block) irrespective of the number of partitions
 * and the produced splits only cover intersection between current files in hdfs and files stored internally
 * in AsterixDB
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 * @return
 * @throws IOException
 */
public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create files splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i).getFileName() == file.getFileName()
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }

            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block interact with delta -> Create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files.clear();
    files.addAll(orderedExternalFiles);
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}