Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing hostnames, offset and size of portions of the given file.
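
Before the project examples below, here is a minimal, self-contained sketch of calling getFileBlockLocations directly. The input path (/tmp/example.txt) and the plain Configuration are illustrative assumptions only; the sketch uses the FileStatus-based overload, as most of the examples that follow do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input path; point this at a file that exists on your cluster.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);

        // Ask for the block locations covering the whole file.
        BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : locations) {
            System.out.println("offset=" + block.getOffset() + " length=" + block.getLength()
                    + " hosts=" + String.join(",", block.getHosts()));
        }
    }
}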

Usage

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsedMillis());
    }
    return splits;
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
            minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, bytesRemaining,
                        clusterMap);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(makeSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.zjy.mongo.splitter.BSONSplitter.java

License:Apache License

public BSONFileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart,
        final long splitLen) {
    BSONFileSplit split;
    try {
        BlockLocation[] blkLocations;

        // This code is based off of org.apache.hadoop.mapreduce.lib
        // .input.FileInputFormat.getSplits()
        if (inFile instanceof LocatedFileStatus) {
            blkLocations = ((LocatedFileStatus) inFile).getBlockLocations();
        } else {
            blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        }

        int blockIndex = getBlockIndex(blkLocations, splitStart);
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn(
                "Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                        + e.getMessage());
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
    split.setKeyField(MongoConfigUtil.getInputKey(getConf()));
    return split;
}

From source file:edu.iu.common.MultiFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    // Random random = new Random(System.nanoTime());
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<Path>();
            hostSet = new HashSet<String>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}

From source file:edu.iu.fileformat.MultiFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits
    List<InputSplit> splits = new ArrayList<>();
    List<FileStatus> files = listStatus(job);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    int numMaps = jobConf.getNumMapTasks();
    LOG.info("NUMBER OF FILES: " + files.size());
    LOG.info("NUMBER OF MAPS: " + numMaps);
    // randomizeFileListOrder(files);
    int avg = files.size() / numMaps;
    int rest = files.size() % numMaps;
    int tmp = 0;
    long length = 0;
    List<Path> pathList = null;
    Set<String> hostSet = null;
    for (FileStatus file : files) {
        if (tmp == 0) {
            pathList = new ArrayList<>();
            hostSet = new HashSet<>();
        }
        if (tmp < avg) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            tmp++;
            if (tmp == avg && rest == 0) {
                LOG.info("Split on host: " + getHostsString(hostSet));
                splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
                tmp = 0;
                length = 0;
            }
        } else if (tmp == avg && rest > 0) {
            pathList.add(file.getPath());
            length = length + file.getLen();
            FileSystem fs = file.getPath().getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            for (BlockLocation blockLocation : blkLocations) {
                for (String host : blockLocation.getHosts()) {
                    hostSet.add(host);
                }
            }
            rest--;
            LOG.info("Split on host: " + getHostsString(hostSet));
            splits.add(new MultiFileSplit(pathList, length, hostSet.toArray(new String[0])));
            tmp = 0;
            length = 0;
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, numMaps);
    LOG.info("Total # of splits: " + splits.size());
    return splits;
}

From source file:edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory.java

License:Apache License

/**
 * Instead of creating the splits using the input format, we do it manually.
 * This function returns fileSplits (1 per HDFS file block) irrespective of the number of partitions,
 * and the produced splits only cover the intersection between the current files in HDFS and the files
 * stored internally in AsterixDB.
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 *
 * @return
 * @throws IOException
 */
protected InputSplit[] getSplits(JobConf conf) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create files splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i).getFileName().equals(file.getFileName())
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }

            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block intersects with the delta -> create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files = orderedExternalFiles;
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}

From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too
 * big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java

License:Apache License

public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) { // note: unreachable as written, since the branch above already handles length != 0
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Spatially joins two files.
 * @param inputFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public static long distributedJoinSmart(final Path[] inputFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    Path[] originalInputFiles = inputFiles.clone();
    FileSystem outFs = inputFiles[0].getFileSystem(params);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }

    // Decide whether to do a repartition step or not
    int cost_with_repartition, cost_without_repartition;
    final FileStatus[] fStatus = new FileStatus[inputFiles.length];
    for (int i_file = 0; i_file < inputFiles.length; i_file++) {
        // TODO work with folders. Calculate size more accurately
        FileSystem fs = inputFiles[i_file].getFileSystem(params);
        fStatus[i_file] = fs.getFileStatus(inputFiles[i_file]);
    }

    // Sort files by length (size)
    IndexedSortable filesBySize = new IndexedSortable() {
        @Override
        public void swap(int i, int j) {
            Path tmp1 = inputFiles[i];
            inputFiles[i] = inputFiles[j];
            inputFiles[j] = tmp1;

            FileStatus tmp2 = fStatus[i];
            fStatus[i] = fStatus[j];
            fStatus[j] = tmp2;
        }

        @Override
        public int compare(int i, int j) {
            // Sort ascending by file length; return 0 only when the lengths are equal
            if (fStatus[i].getLen() == fStatus[j].getLen())
                return 0;
            return fStatus[i].getLen() < fStatus[j].getLen() ? -1 : 1;
        }
    };

    new QuickSort().sort(filesBySize, 0, inputFiles.length);
    GlobalIndex<Partition>[] gIndexes = new GlobalIndex[fStatus.length];
    int[] numBlocks = new int[fStatus.length];
    for (int i_file = 0; i_file < fStatus.length; i_file++) {
        gIndexes[i_file] = SpatialSite.getGlobalIndex(outFs, fStatus[i_file].getPath());
        if (gIndexes[i_file] != null) {
            // Number of blocks is equal to number of partitions in global
            // index
            numBlocks[i_file] = gIndexes[i_file].size();
        } else if (fStatus[i_file].isDir()) {
            // Add up number of file system blocks in all subfiles of this
            // directory
            numBlocks[i_file] = 0;
            FileStatus[] subfiles = outFs.listStatus(inputFiles[i_file], SpatialSite.NonHiddenFileFilter);
            for (FileStatus subfile : subfiles) {
                numBlocks[i_file] += outFs.getFileBlockLocations(subfile, 0, subfile.getLen()).length;
            }
        } else {
            // Number of file system blocks in input file
            numBlocks[i_file] = outFs.getFileBlockLocations(fStatus[i_file], 0,
                    fStatus[i_file].getLen()).length;
        }
    }

    cost_without_repartition = gIndexes[0] != null && gIndexes[1] != null
            ? GlobalIndex.spatialJoin(gIndexes[0], gIndexes[1], null)
            : (numBlocks[0] * numBlocks[1]);
    // Total cost = Cost of repartition (=== 2 * numBlocks[0]) +
    // cost of join (=== numBlocks[0] + numBlocks[1])
    cost_with_repartition = numBlocks[0] * 3 + numBlocks[1];
    LOG.info("Cost with repartition is estimated to " + cost_with_repartition);
    LOG.info("Cost without repartition is estimated to " + cost_without_repartition);
    boolean need_repartition = cost_with_repartition < cost_without_repartition;
    if (need_repartition) {
        int file_to_repartition = selectRepartition(inputFiles, params);
        repartitionStep(inputFiles, file_to_repartition, params);
    }

    // Restore inputFiles to the original order by user
    if (inputFiles[1] != originalInputFiles[1]) {
        Path temp = inputFiles[0];
        inputFiles[0] = inputFiles[1];
        inputFiles[1] = temp;
    }

    // Redistribute join the larger file and the partitioned file
    long result_size = DistributedJoin.joinStep(inputFiles, outputPath, params);

    if (userOutputPath == null)
        outFs.delete(outputPath, true);

    return result_size;
}

From source file:edu.umn.cs.spatialHadoop.ReadFile.java

License:Open Source License

public static void main(String[] args) throws Exception {
    OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getInt("offset", 0) == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}