Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing hostnames, offset and size of portions of the given file.
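
Before the project examples below, here is a minimal, self-contained sketch of calling getFileBlockLocations directly. The input path (/tmp/example.txt) and the plain Configuration are illustrative assumptions only; the sketch uses the FileStatus-based overload, as most of the examples that follow do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input path; point this at a file that exists on your cluster.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);

        // Ask for the block locations covering the whole file.
        BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : locations) {
            System.out.println("offset=" + block.getOffset() + " length=" + block.getLength()
                    + " hosts=" + String.join(",", block.getHosts()));
        }
    }
}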

Usage

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsedMillis());
    }
    return splits;
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
            minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, bytesRemaining,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(makeSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.zjy.mongo.splitter.BSONSplitter.java

License:Apache License

public BSONFileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart,
        final long splitLen) {
    BSONFileSplit split;
    try {
        BlockLocation[] blkLocations;

        // This code is based off of org.apache.hadoop.mapreduce.lib
        // .input.FileInputFormat.getSplits()
        if (inFile instanceof LocatedFileStatus) {
            blkLocations = ((LocatedFileStatus) inFile).getBlockLocations();
        } else {
            blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        }

        int blockIndex = getBlockIndex(blkLocations, splitStart);
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn(
                "Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                        + e.getMessage());
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
    split.setKeyField(MongoConfigUtil.getInputKey(getConf()));
    return split;
}

From source file:edu.iu.common.MultiFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    // Random random = new Random(System.nanoTime());
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<Path>();
            hostSet = new HashSet<String>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}

From source file:edu.iu.fileformat.MultiFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    // randomizeFileListOrder(files);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<>();
            hostSet = new HashSet<>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}

From source file:edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory.java

License:Apache License

/**
 * Instead of creating the splits using the input format, we do it manually.
 * This function returns fileSplits (1 per HDFS file block) irrespective of the number of partitions,
 * and the produced splits only cover the intersection between the current files in HDFS and the files
 * stored internally in AsterixDB.
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 *
 * @return
 * @throws IOException
 */
protected InputSplit[] getSplits(JobConf conf) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create files splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i).getFileName().equals(file.getFileName())
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }

            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block intersects with the delta -> create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files = orderedExternalFiles;
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}

From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too
 * big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java

License:Apache License

public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) { // note: unreachable as written, since the branch above already handles length != 0
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Spatially joins two files.
 * @param inputFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public static long distributedJoinSmart(final Path[] inputFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    Path[] originalInputFiles = inputFiles.clone();
    FileSystem outFs = inputFiles[0].getFileSystem(params);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }

    // Decide whether to do a repartition step or not
    int cost_with_repartition, cost_without_repartition;
    final FileStatus[] fStatus = new FileStatus[inputFiles.length];
    for (int i_file = 0; i_file < inputFiles.length; i_file++) {
        // TODO work with folders. Calculate size more accurately
        FileSystem fs = inputFiles[i_file].getFileSystem(params);
        fStatus[i_file] = fs.getFileStatus(inputFiles[i_file]);
    }

    // Sort files by length (size)
    IndexedSortable filesBySize = new IndexedSortable() {
        @Override
        public void swap(int i, int j) {
            Path tmp1 = inputFiles[i];
            inputFiles[i] = inputFiles[j];
            inputFiles[j] = tmp1;

            FileStatus tmp2 = fStatus[i];
            fStatus[i] = fStatus[j];
            fStatus[j] = tmp2;
        }

        @Override
        public int compare(int i, int j) {
            // Sort ascending by file length; return 0 only when the lengths are equal
            if (fStatus[i].getLen() == fStatus[j].getLen())
                return 0;
            return fStatus[i].getLen() < fStatus[j].getLen() ? -1 : 1;
        }
    };

    new QuickSort().sort(filesBySize, 0, inputFiles.length);
    GlobalIndex<Partition>[] gIndexes = new GlobalIndex[fStatus.length];
    int[] numBlocks = new int[fStatus.length];
    for (int i_file = 0; i_file < fStatus.length; i_file++) {
        gIndexes[i_file] = SpatialSite.getGlobalIndex(outFs, fStatus[i_file].getPath());
        if (gIndexes[i_file] != null) {
            // Number of blocks is equal to number of partitions in global
            // index
            numBlocks[i_file] = gIndexes[i_file].size();
        } else if (fStatus[i_file].isDir()) {
            // Add up number of file system blocks in all subfiles of this
            // directory
            numBlocks[i_file] = 0;
            FileStatus[] subfiles = outFs.listStatus(inputFiles[i_file], SpatialSite.NonHiddenFileFilter);
            for (FileStatus subfile : subfiles) {
                numBlocks[i_file] += outFs.getFileBlockLocations(subfile, 0, subfile.getLen()).length;
            }
        } else {
            // Number of file system blocks in input file
            numBlocks[i_file] = outFs.getFileBlockLocations(fStatus[i_file], 0,
                    fStatus[i_file].getLen()).length;
        }
    }

    cost_without_repartition = gIndexes[0] != null && gIndexes[1] != null
            ? GlobalIndex.spatialJoin(gIndexes[0], gIndexes[1], null)
            : (numBlocks[0] * numBlocks[1]);
    // Total cost = Cost of repartition (=== 2 * numBlocks[0]) +
    // cost of join (=== numBlocks[0] + numBlocks[1])
    cost_with_repartition = numBlocks[0] * 3 + numBlocks[1];
    LOG.info("Cost with repartition is estimated to " + cost_with_repartition);
    LOG.info("Cost without repartition is estimated to " + cost_without_repartition);
    boolean need_repartition = cost_with_repartition < cost_without_repartition;
    if (need_repartition) {
        int file_to_repartition = selectRepartition(inputFiles, params);
        repartitionStep(inputFiles, file_to_repartition, params);
    }

    // Restore inputFiles to the original order by user
    if (inputFiles[1] != originalInputFiles[1]) {
        Path temp = inputFiles[0];
        inputFiles[0] = inputFiles[1];
        inputFiles[1] = temp;
    }

    // Redistribute join the larger file and the partitioned file
    long result_size = DistributedJoin.joinStep(inputFiles, outputPath, params);

    if (userOutputPath == null)
        outFs.delete(outputPath, true);

    return result_size;
}

From source file:edu.umn.cs.spatialHadoop.ReadFile.java

License:Open Source License

public static void main(String[] args) throws Exception {
    OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getInt("offset", 0) == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}