Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException

Source Link

Document

Return an array containing hostnames, offset and size of portions of the given file.

Usage

From source file:PageInputFormat.java

License:Apache License

public InputSplit[] getSplits(JobConf job, int num) throws IOException {
    long minSize = 1;
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            FileSystem fs = path.getFileSystem(job);
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(path.getFileSystem(job), path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }//from ww w.j a v a  2 s.  c  o  m

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
        } else
            splits.add(makeSplit(path, 0, length, new String[0]));
    }
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    return splits.toArray(new InputSplit[0]);

}

From source file:StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();//ww w  . j  a v  a 2s. com
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        InputStream is = null;
        long start = 0;
        long skip = 0;
        if (is != null) {
            // start = is.getAdjustedStart();
            // length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                // SplitCompressionInputStream cin =
                // in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file:DupleInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context// ww  w .  j  av a  2  s .c o  m
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    // times that each file exists in the files List
    ArrayList<Integer> times = new ArrayList<Integer>();
    ArrayList<Path> paths = new ArrayList<Path>();

    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

            int index;
            if ((index = paths.indexOf(path)) != -1)
                times.set(index, times.get(index) + 1);
            else {
                times.add(0);
                paths.add(path);
                index = times.size() - 1;
            }

            // not splitable
            splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), times.get(index)));

        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    //LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:HDFSFileFinder.java

License:Apache License

private static void getBlockLocationsFromHdfs() {
    StringBuilder sb = new StringBuilder();
    Configuration conf = new Configuration();
    boolean first = true;

    // make connection to hdfs
    try {//from  www  . j  a  v  a  2  s  . c  o m
        if (verbose) {
            writer.println("DEBUG: Trying to connect to " + fsName);
        }
        FileSystem fs = FileSystem.get(conf);
        Path file = new Path(fileName);
        FileStatus fStatus = fs.getFileStatus(file);
        status = fStatus;
        bLocations = fs.getFileBlockLocations(status, 0, status.getLen());
        //print out all block locations
        for (BlockLocation aLocation : bLocations) {
            String[] names = aLocation.getHosts();
            for (String name : names) {
                InetAddress addr = InetAddress.getByName(name);
                String host = addr.getHostName();
                int idx = host.indexOf('.');
                String hostname;
                if (0 < idx) {
                    hostname = host.substring(0, host.indexOf('.'));
                } else {
                    hostname = host;
                }
                if (first) {
                    sb.append(hostname);
                    first = false;
                } else {
                    sb.append(",").append(hostname);
                }
            }
        }
        sb.append(NEWLINE);
    } catch (IOException e) {
        writer.println("Error getting block location data from namenode");
        e.printStackTrace();
    }
    writer.print(sb.toString());
    writer.flush();
}

From source file:a.TestConcatExample.java

License:Apache License

@Test
public void concatIsPermissive() throws IOException, URISyntaxException {
    MiniDFSCluster cluster = null;/*from w w w  .j  a  va  2  s.com*/
    final Configuration conf = WebHdfsTestUtil.createConf();
    conf.set("dfs.namenode.fs-limits.min-block-size", "1000"); // Allow tiny blocks for the test
    try {
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        cluster.waitActive();
        final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME);
        final FileSystem dfs = cluster.getFileSystem();

        final FileSystem fs = dfs; // WebHDFS has a bug in getLocatedBlocks

        Path root = new Path("/dir");
        fs.mkdirs(root);

        short origRep = 3;
        short secondRep = (short) (origRep - 1);
        Path f1 = new Path("/dir/f1");
        long size1 = writeFile(fs, f1, /* blocksize */ 4096, origRep, 5);
        long f1NumBlocks = fs.getFileBlockLocations(f1, 0, size1).length;
        assertEquals(5, f1NumBlocks);

        Path f2 = new Path("/dir/f2");
        long size2 = writeFile(fs, f2, /* blocksize (must divide 512 for checksum) */ 4096 - 512, secondRep, 4);
        long f2NumBlocks = fs.getFileBlockLocations(f2, 0, size2).length;
        assertEquals(5, f2NumBlocks);

        fs.concat(f1, new Path[] { f2 });
        FileStatus[] fileStatuses = fs.listStatus(root);

        // Only one file should remain
        assertEquals(1, fileStatuses.length);
        FileStatus fileStatus = fileStatuses[0];

        // And it should be named after the first file
        assertEquals("f1", fileStatus.getPath().getName());

        // The entire file takes the replication of the first argument
        assertEquals(origRep, fileStatus.getReplication());

        // As expected, the new concated file is the length of both the previous files
        assertEquals(size1 + size2, fileStatus.getLen());

        // And we should have the same number of blocks
        assertEquals(f1NumBlocks + f2NumBlocks,
                fs.getFileBlockLocations(fileStatus.getPath(), 0, size1 + size2).length);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
        }

    }
}

From source file:ca.sparkera.adapters.mapred.MainframeVBInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too
 * big.//  www  .j  av a 2  s . c o m
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn;
        InputStream inputStream;
        fileIn = fs.open(path);
        inputStream = fileIn;
        filePosition = fileIn;
        long offset = 0;
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();

            long bytesRemaining = length;
            long splitSize = 0;
            while (offset < length) {
                splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);

                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));

                bytesRemaining -= splitSize;
                offset = length - bytesRemaining;
            }

            if (bytesRemaining != 0) {
                throw new IOException(
                        "Partial record(length = " + bytesRemaining + ") found at the end of file " + path);
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
        if (inputStream != null) {
            inputStream.close();
            inputStream = null;
        }
    }
    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100%  Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java

License:Apache License

/**
 * Computes splits for the event file.//from   w ww. j  a v  a2 s . c  om
 */
<T> void computeSplits(FileSystem fs, long minSplitSize, long maxSplitSize, long startTime, long endTime,
        List<T> splits, StreamInputSplitFactory<T> splitFactory) throws IOException {

    // Compute the splits based on the min/max size
    Path eventFile = eventFileStatus.getPath();
    Path indexFile = getIndexFile(eventFile);

    BlockLocation[] blockLocations = fs.getFileBlockLocations(eventFile, 0, eventFileStatus.getLen());

    long length = eventFileStatus.getLen();
    long offset = 0;
    int blockIndex = 0;

    while (offset < length) {
        blockIndex = getBlockIndex(blockLocations, offset, blockIndex);
        String[] hosts = null;
        if (blockIndex >= 0) {
            hosts = blockLocations[blockIndex].getHosts();
        } else {
            blockIndex = 0;
        }

        long splitSize = computeSplitSize(eventFileStatus, offset, minSplitSize, maxSplitSize);
        splits.add(
                splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, splitSize, hosts));
        offset += splitSize;
    }

    // One extra split for the tail of the file.
    splits.add(
            splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, Long.MAX_VALUE, null));
}

From source file:com.asakusafw.runtime.directio.hadoop.BlockMap.java

License:Apache License

/**
 * Returns a list of {@link BlockInfo} for the target file.
 * @param fs the target file//from   w w  w  .ja  va  2  s. c om
 * @param status the target file status
 * @return the computed information
 * @throws IOException if failed to compute information
 */
public static List<BlockInfo> computeBlocks(FileSystem fs, FileStatus status) throws IOException {
    BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
    List<BlockInfo> results = new ArrayList<>();
    for (BlockLocation location : locations) {
        long length = location.getLength();
        long start = location.getOffset();
        results.add(new BlockInfo(start, start + length, location.getHosts()));
    }
    return results;
}

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big.*///from  w ww.  j  a v a  2s. c o  m
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");

    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job/* w ww .  j  av  a  2  s .c  om*/
 *        The current BSPJob job
 * @return input splits
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = 0L;
            if (job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1) == 1) {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize;
                } else {
                    splitSize = job.getSplitSize();
                }
            } else {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                } else {
                    splitSize = job.getSplitSize() * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                }
            }
            LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.info("[Split Number] " + splits.size());
    return splits;
}