Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Source Link

Document

Return an array containing hostnames, offset and size of portions of the given file.

Usage

From source file:PageInputFormat.java

License:Apache License

/**
 * Builds the input splits for every file returned by {@code listStatus(job)}.
 *
 * <p>Zero-length files yield a single split with an empty host array. Files that
 * {@code isSplitable} rejects yield one split covering the whole file, placed on the
 * hosts of the first block. All other files are cut into pieces of
 * {@code computeSplitSize(blockSize, minSize, maxSize)} bytes for as long as the
 * remaining bytes exceed {@code SPLIT_SLOP} split sizes; whatever is left becomes
 * the final split.
 *
 * @param job the job configuration; also receives the input file count under
 *            {@code NUM_INPUT_FILES}
 * @param num not used by this implementation
 * @return the generated splits
 * @throws IOException if listing the files or querying block locations fails
 */
public InputSplit[] getSplits(JobConf job, int num) throws IOException {
    long minSize = 1;
    long maxSize = getMaxSplitSize(job);

    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();

        if (length == 0) {
            // Nothing to locate for an empty file: use an empty hosts array.
            splits.add(makeSplit(path, 0, length, new String[0]));
            continue;
        }

        FileSystem fs = path.getFileSystem(job);
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        // Reuse the file system resolved above instead of looking it up a second time.
        if (!isSplitable(fs, path)) {
            splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            continue;
        }

        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            long offset = length - bytesRemaining;
            int blkIndex = getBlockIndex(blkLocations, offset);
            splits.add(makeSplit(path, offset, splitSize, blkLocations[blkIndex].getHosts()));
            bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
            // Tail of the file that did not fill a complete split.
            long offset = length - bytesRemaining;
            int blkIndex = getBlockIndex(blkLocations, offset);
            splits.add(makeSplit(path, offset, bytesRemaining, blkLocations[blkIndex].getHosts()));
        }
    }

    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    return splits.toArray(new InputSplit[0]);
}

From source file:StreamWikiDumpInputFormat.java

License:Apache License

/**
 * Computes the input splits for a single file, placing split boundaries at
 * positions where {@code pageEndPattern} is matched.
 *
 * <p>For a non-empty file that {@code isSplitable} accepts, the method repeatedly
 * opens a look-ahead split, scans it with a {@code ByteMatcher} for
 * {@code pageEndPattern}, and emits a split from the current {@code start} up to
 * the matched position. A non-empty, non-splitable file yields one split covering
 * the whole file; an empty file yields one split with an empty host array.
 *
 * @param job       job configuration used to resolve the file system and to read
 *                  {@code KEY_SKIP_FACTOR} (default {@code 1.2F})
 * @param file      status of the file to split
 * @param pattern   not referenced anywhere in this method body; matching uses
 *                  {@code pageEndPattern}, which is declared outside this method
 * @param splitSize size in bytes of each look-ahead step
 * @return the splits generated for {@code file}
 * @throws IOException if block locations cannot be obtained or reading fails
 */
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    // Block locations are requested even for zero-length files.
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        // NOTE(review): `is` is never assigned anything but null, so the branch
        // below is dead code as written.
        InputStream is = null;
        // start: beginning of the split being built; skip: bytes already scanned
        // past `start` without producing a split.
        long start = 0;
        long skip = 0;
        if (is != null) {
            // start = is.getAdjustedStart();
            // length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        // Match positions that have already been used as a split end.
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        // Keep going while more than `factor` split sizes of data remain.
        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                // The look-ahead split begins one split size past the scanned
                // region, clamped to the last byte of the file.
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                // Close the stream from the previous iteration (or the initial one).
                if (in != null)
                    in.close();
                // NOTE(review): `in` was just closed and is still non-null here, so
                // the close after the loop runs on it a second time — confirm that
                // SeekableInputStream tolerates a double close.
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                // SplitCompressionInputStream cin =
                // in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            // NOTE(review): `reach` is written but never read in this method.
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                // Give up on the whole loop once the matcher is at or past the end
                // of the file, or the look-ahead split already reaches the end.
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                // No match yet: grow the look-ahead split by another split size,
                // capped at the end of the file. The matcher and stream are reused.
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                // A new page end was found: emit [start, match position) and
                // continue from the matcher's last unmatched position (never
                // moving backwards past `start`).
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                // No usable new match: remember how far was scanned so the next
                // look-ahead split starts beyond it.
                skip = matcher.getPos() - start;
            }
        }

        // Emit the remaining tail, placed on the hosts of the last block.
        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        // NOTE(review): this close is skipped if anything above throws.
        if (in != null)
            in.close();
    } else if (length != 0) {
        // Non-splitable, non-empty file: one split for the whole file.
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file:DupleInputFormat.java

License:Apache License

/**
 * Lists the job's input files and wraps each one in a single, unsplit split.
 *
 * <p>Non-empty files are placed on the hosts of their first block and carry a
 * counter telling how many times the same path appeared earlier among the
 * non-empty files (0 for the first occurrence). Empty files get a split with
 * an empty host array.
 *
 * @param job the job context
 * @return one split per listed file
 * @throws IOException if listing files or fetching block locations fails
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    List<InputSplit> result = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    // Parallel lists: seenPaths.get(i) has been repeated occurrences.get(i) times so far.
    ArrayList<Integer> occurrences = new ArrayList<Integer>();
    ArrayList<Path> seenPaths = new ArrayList<Path>();

    for (FileStatus status : files) {
        Path path = status.getPath();
        long length = status.getLen();

        if (length == 0) {
            // Zero-length files have no blocks, so hand over an empty hosts array.
            result.add(makeSplit(path, 0, length, new String[0]));
            continue;
        }

        FileSystem fs = path.getFileSystem(job.getConfiguration());
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, length);

        int slot = seenPaths.indexOf(path);
        if (slot == -1) {
            // First time this path shows up: start its counter at zero.
            occurrences.add(0);
            seenPaths.add(path);
            slot = occurrences.size() - 1;
        } else {
            occurrences.set(slot, occurrences.get(slot) + 1);
        }

        // Files are never split: a single split spans the whole file.
        result.add(makeSplit(path, 0, length, blocks[0].getHosts(), occurrences.get(slot)));
    }

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    return result;
}

From source file:HDFSFileFinder.java

License:Apache License

/**
 * Looks up the block locations of {@code fileName} and prints the short host
 * names of all replicas as one comma-separated line on {@code writer}.
 *
 * <p>The file status and block locations are also stored in {@code status} and
 * {@code bLocations}. If an {@link IOException} occurs, an error line is printed
 * and whatever host names were collected up to that point are still written.
 */
private static void getBlockLocationsFromHdfs() {
    final StringBuilder hostList = new StringBuilder();
    final Configuration conf = new Configuration();
    boolean needSeparator = false;

    try {
        if (verbose) {
            writer.println("DEBUG: Trying to connect to " + fsName);
        }
        // make connection to hdfs
        final FileSystem fs = FileSystem.get(conf);
        status = fs.getFileStatus(new Path(fileName));
        bLocations = fs.getFileBlockLocations(status, 0, status.getLen());

        // Walk every block and every host holding it.
        for (BlockLocation block : bLocations) {
            for (String rawName : block.getHosts()) {
                final String resolved = InetAddress.getByName(rawName).getHostName();
                // Keep only the part before the first dot, when there is one past index 0.
                final int dot = resolved.indexOf('.');
                final String shortName = (dot > 0) ? resolved.substring(0, dot) : resolved;

                if (needSeparator) {
                    hostList.append(",");
                }
                hostList.append(shortName);
                needSeparator = true;
            }
        }
        hostList.append(NEWLINE);
    } catch (IOException e) {
        writer.println("Error getting block location data from namenode");
        e.printStackTrace();
    }

    writer.print(hostList.toString());
    writer.flush();
}

From source file:a.TestConcatExample.java

License:Apache License

/**
 * Concatenates two files written with different block sizes and replication
 * factors on a two-datanode mini cluster, then checks that only the target
 * file remains and that it keeps the target's name and replication, has the
 * combined length, and has the combined block count.
 */
@Test
public void concatIsPermissive() throws IOException, URISyntaxException {
    MiniDFSCluster cluster = null;
    final Configuration conf = WebHdfsTestUtil.createConf();
    // Allow tiny blocks for the test
    conf.set("dfs.namenode.fs-limits.min-block-size", "1000");
    try {
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        cluster.waitActive();
        final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME);
        final FileSystem dfs = cluster.getFileSystem();

        // WebHDFS has a bug in getLocatedBlocks, so the test runs against DFS directly.
        final FileSystem fs = dfs;

        final Path dir = new Path("/dir");
        fs.mkdirs(dir);

        final short origRep = 3;
        final short secondRep = (short) (origRep - 1);

        // Target file: 4096-byte blocks, replication origRep.
        final Path target = new Path("/dir/f1");
        final long targetSize = writeFile(fs, target, 4096, origRep, 5);
        final long targetBlocks = fs.getFileBlockLocations(target, 0, targetSize).length;
        assertEquals(5, targetBlocks);

        // Source file: 3584-byte blocks (still a multiple of 512 — presumably
        // required for checksumming), lower replication.
        final Path source = new Path("/dir/f2");
        final long sourceSize = writeFile(fs, source, 4096 - 512, secondRep, 4);
        final long sourceBlocks = fs.getFileBlockLocations(source, 0, sourceSize).length;
        assertEquals(5, sourceBlocks);

        fs.concat(target, new Path[] { source });
        final FileStatus[] remaining = fs.listStatus(dir);

        // Only one file should remain
        assertEquals(1, remaining.length);
        final FileStatus merged = remaining[0];

        // And it should be named after the first file
        assertEquals("f1", merged.getPath().getName());

        // The entire file takes the replication of the first argument
        assertEquals(origRep, merged.getReplication());

        // The concatenated file is as long as both inputs together
        assertEquals(targetSize + sourceSize, merged.getLen());

        // And it holds the blocks of both inputs
        final int mergedBlocks = fs.getFileBlockLocations(merged.getPath(), 0, targetSize + sourceSize).length;
        assertEquals(targetBlocks + sourceBlocks, mergedBlocks);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
        }
    }
}

From source file:ca.sparkera.adapters.mapred.MainframeVBInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 *
 * <p>Each file is opened so that {@code computeSplitSize} can read from it while
 * sizing splits. Splitable, non-empty files are cut into consecutive splits until
 * the whole length is covered; the splits must add up to the file length exactly.
 * Non-splitable files yield one split on the hosts of the first block, and empty
 * files yield one split with an empty host array.
 *
 * <p>The per-file stream is closed in a {@code finally} block, so it is released
 * even when an exception (including the "Partial record" error) aborts the method.
 *
 * <p>NOTE(review): {@code totalSize} is neither declared nor reset in this method;
 * if it is an instance field, repeated calls keep accumulating — confirm.
 *
 * @param job       the job configuration
 * @param numSplits the requested number of splits, used to derive the goal size
 * @return the generated splits
 * @throws IOException if an input path is a directory, if the splits of a file do
 *                     not add up to its length, or if file system access fails
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(path);
        try {
            InputStream inputStream = fileIn;
            // Published outside this method; the split-size computation is passed the same stream.
            filePosition = fileIn;
            long offset = 0;
            long length = file.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if ((length != 0) && isSplitable(fs, path)) {
                long blockSize = file.getBlockSize();

                long bytesRemaining = length;
                long splitSize = 0;
                while (offset < length) {
                    splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);

                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));

                    bytesRemaining -= splitSize;
                    offset = length - bytesRemaining;
                }

                // The loop only ends once offset >= length; anything other than an
                // exact fit means the last split ran past the end of the file.
                if (bytesRemaining != 0) {
                    throw new IOException(
                            "Partial record(length = " + bytesRemaining + ") found at the end of file " + path);
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        } finally {
            // Always release the stream, including on the error paths above.
            fileIn.close();
        }
    }
    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100%  Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java

License:Apache License

/**
 * Computes splits for the event file and appends them to {@code splits}.
 *
 * <p>The file is walked from offset 0 in steps given by {@code computeSplitSize};
 * each step becomes one split carrying the hosts of the block that
 * {@code getBlockIndex} reports for its offset, or {@code null} hosts when the
 * reported index is negative. A final open-ended split
 * (length {@link Long#MAX_VALUE}, no hosts) is always appended for the tail.
 *
 * @param fs           file system holding the event file
 * @param minSplitSize lower bound passed to the split-size computation
 * @param maxSplitSize upper bound passed to the split-size computation
 * @param startTime    start time handed to every created split
 * @param endTime      end time handed to every created split
 * @param splits       list receiving the created splits
 * @param splitFactory factory that creates the split objects
 * @param <T>          type of split produced by the factory
 * @throws IOException if the block locations cannot be fetched
 */
<T> void computeSplits(FileSystem fs, long minSplitSize, long maxSplitSize, long startTime, long endTime,
        List<T> splits, StreamInputSplitFactory<T> splitFactory) throws IOException {

    final Path eventFile = eventFileStatus.getPath();
    final Path indexFile = getIndexFile(eventFile);

    final long fileLength = eventFileStatus.getLen();
    final BlockLocation[] blocks = fs.getFileBlockLocations(eventFile, 0, fileLength);

    long position = 0;
    int searchFrom = 0;

    while (position < fileLength) {
        final int found = getBlockIndex(blocks, position, searchFrom);
        final String[] hosts = (found >= 0) ? blocks[found].getHosts() : null;
        // When no block was found, the next lookup starts again from the first block.
        searchFrom = Math.max(found, 0);

        final long splitSize = computeSplitSize(eventFileStatus, position, minSplitSize, maxSplitSize);
        T split = splitFactory.createSplit(eventFile, indexFile, startTime, endTime, position, splitSize, hosts);
        splits.add(split);
        position += splitSize;
    }

    // One extra split for the tail of the file.
    T tail = splitFactory.createSplit(eventFile, indexFile, startTime, endTime, position, Long.MAX_VALUE, null);
    splits.add(tail);
}

From source file:com.asakusafw.runtime.directio.hadoop.BlockMap.java

License:Apache License

/**
 * Returns a list of {@link BlockInfo} for the target file, one entry per block
 * location reported by the file system, in the reported order.
 * @param fs the file system holding the target file
 * @param status the target file status
 * @return the computed information
 * @throws IOException if failed to compute information
 */
public static List<BlockInfo> computeBlocks(FileSystem fs, FileStatus status) throws IOException {
    final BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
    final List<BlockInfo> infos = new ArrayList<>();
    for (int i = 0; i < blocks.length; i++) {
        final BlockLocation block = blocks[i];
        // Length is read before offset, then the range [offset, offset + length) is recorded.
        final long size = block.getLength();
        final long begin = block.getOffset();
        final long end = begin + size;
        infos.add(new BlockInfo(begin, end, block.getHosts()));
    }
    return infos;
}

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 *
 * <p>The goal split size is the total input size divided by {@code numSplits}
 * (treated as 1 when it is 0). Splitable, non-empty files are cut into pieces of
 * {@code computeSplitSize(goalSize, minSize, blockSize)} bytes for as long as the
 * remaining bytes exceed {@code SPLIT_SLOP} split sizes; the remainder becomes a
 * final split placed on the hosts of the last block. Non-splitable files yield one
 * split, and empty files yield one split with an empty host array.
 *
 * @param job       the job configuration; also receives the input file count
 *                  under {@code NUM_INPUT_FILES}
 * @param numSplits the requested number of splits
 * @return the generated splits
 * @throws IOException if an input path is a directory or file system access fails
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                // Tail that did not fill a whole split: place it on the last block's hosts.
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job/* w ww .  j  av  a  2  s .c  om*/
 *        The current BSPJob job
 * @return input splits
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = 0L;
            if (job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1) == 1) {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize;
                } else {
                    splitSize = job.getSplitSize();
                }
            } else {
                if (job.getSplitSize() == 0L) {
                    splitSize = blockSize * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                } else {
                    splitSize = job.getSplitSize() * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
                }
            }
            LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.info("[Split Number] " + splits.size());
    return splits;
}