Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem#getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing hostnames, offset and size of portions of the given file.
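
Before the project examples below, here is a minimal, self-contained sketch of a typical call: obtain the FileStatus of a file, ask for the block locations covering a byte range (here the whole file), and read the hosts from each BlockLocation. The class name and the path /tmp/example.txt are placeholders chosen for illustration, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path file = new Path("/tmp/example.txt"); // placeholder path, assumed to exist
        FileStatus status = fs.getFileStatus(file);

        // Block locations for the whole file: start = 0, len = file length
        BlockLocation[] locations = fs.getFileBlockLocations(file, 0, status.getLen());

        for (BlockLocation loc : locations) {
            System.out.println("offset=" + loc.getOffset()
                    + " length=" + loc.getLength()
                    + " hosts=" + String.join(",", loc.getHosts()));
        }
    }
}

Callers that build MapReduce splits, as in the examples below, typically pass the start offset and length of a candidate split rather than the whole file, and use the returned hosts to place each split near its data.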

Usage

From source file:org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java

License:Mozilla Public License

/** 
 * {@inheritDoc}
 * Make the splits of the index structure. Bit structures split across multiple files are supported.
 */
@SuppressWarnings("unchecked")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    HadoopUtility.loadTerrierJob(job);

    final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
    final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
    Index.setIndexLoadingProfileAsRetrieval(false);
    final IndexOnDisk index = HadoopUtility.fromHConfiguration(job);

    final byte fileCount = Byte
            .parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
    final Path bitPostingStructureFiles[] = new Path[fileCount];
    final FileStatus[] fss = new FileStatus[fileCount];
    final long[] bitPostingStructureFSBlockSizes = new long[fileCount];

    logger.info("Calculating splits of structure " + bitPostingStructureName);
    FileSystem fs = FileSystem.get(job);
    for (byte i = 0; i < fileCount; i++) {
        bitPostingStructureFiles[i] = new Path(
                BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
        fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
        bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
        logger.info("File " + i + " approx splits="
                + ((double) fss[i].getLen() / (double) bitPostingStructureFSBlockSizes[i]));
    }

    //this smells of a hack, because we don't have a strategy for naming the various index structure streams
    final Iterator<? extends BitIndexPointer> offsetIterator = index
            .hasIndexStructureInputStream(lookupStructureName + "-entry")
                    ? (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName + "-entry")
                    : (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName);

    if (offsetIterator == null)
        throw new IOException("No such stream structure called " + lookupStructureName + "-entry or "
                + lookupStructureName + " found in index");
    final List<InputSplit> splitList = new ArrayList<InputSplit>();

    int currentId = 0;

    //size of the current split of each file
    final long[] blockSize = new long[fileCount];
    //location of the last split for each file
    final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];

    //how many entries will be in this split, for each file
    final int[] entriesInBlock = new int[fileCount];
    //what is the starting id of the next entry split, for each file
    final int[] firstEntryOfNextSplit = new int[fileCount];

    //number of splits per file, for logging only
    final int[] splitsPerFile = new int[fileCount];

    Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);

    BitIndexPointer currentPointer = null;
    //iterate through the lookup iterator
    //split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
    while (offsetIterator.hasNext()) {
        //ok, where is the next pointer to
        currentPointer = offsetIterator.next();
        final byte fileId = currentPointer.getFileNumber();

        //what is the first entry of the next split of this file?
        firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
        //this split will have one more entry
        entriesInBlock[fileId]++;

        //what is our current offset?
        long offset = currentPointer.getOffset();
        //System.err.println("Offset" + offset);
        //if we made the split here, how big would it be?
        blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
        //is this block large enough?
        if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId]) {
            //yes, it's big enough
            //block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                    bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
            splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path
                    bitPostingStructureSplitEndOffsets[fileId], //start
                    blockSize[fileId], //length
                    blkLocations[0].getHosts(), //hosts
                    firstEntryOfNextSplit[fileId], //first entry in this split
                    entriesInBlock[fileId]) //number of entries in this split
            );
            logger.info("File " + fileId + " split " + (splitList.size() - 1) + " "
                    + splitList.get(splitList.size() - 1).toString());
            //record another split for this file (for logging only)
            splitsPerFile[fileId]++;
            //update recording of last offset for this file
            bitPostingStructureSplitEndOffsets[fileId] = offset;
            //reset size of split for this file
            blockSize[fileId] = 0;
            //reset counter of entries in split of this file
            entriesInBlock[fileId] = 0;
            //reset the first offset of this split
            firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
        }

        //ids always increment
        currentId++;
    }
    IndexUtil.close(offsetIterator);
    //find any files which have trailing blocks
    for (byte fileId = 0; fileId < fileCount; fileId++) {
        if (entriesInBlock[fileId] == 0)
            continue;
        assert (firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);

        //block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
        BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
        splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path of file for split
                bitPostingStructureSplitEndOffsets[fileId], //start offset of this split
                blockSize[fileId], //size of this split
                blkLocations[0].getHosts(), //hosts for this split
                firstEntryOfNextSplit[fileId], //first entry id for this split
                entriesInBlock[fileId]) //number of entries in this split
        );
        logger.info("File " + fileId + " trailing split " + (splitList.size() - 1) + " "
                + splitList.get(splitList.size() - 1).toString());

        //record another split for this file (for logging only)
        splitsPerFile[fileId]++;
    }

    logger.info("Split " + bitPostingStructureName + " (of " + currentId + " entries) into " + splitList.size()
            + " splits");
    if (fileCount > 1) {
        logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: "
                + ArrayUtils.join(splitsPerFile, ","));
    }
    assert (splitList.size() > 0);
    index.close();
    return splitList.toArray(new InputSplit[splitList.size()]);
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License:Mozilla Public License

/**
 * Splits the input collection into sets of files where each Map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            final long blocksize = Math.min(offset + normalblocksize, lengths[i]);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocksize, blocksize);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate the split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; //final recommended locations for this split.
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (!(pathsUsed == paths.length)) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}

From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file:parquet.hadoop.ParquetInputFormat.java

License:Apache License

/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 */
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long minSplitSize = Math.max(getFormatMinSplitSize(),
            configuration.getLong("mapred.min.split.size", 0L));
    if (maxSplitSize < 0 || minSplitSize < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
                + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers,
            configuration.getBoolean(STRICT_TYPE_CHECKING, true));
    ReadContext readContext = getReadSupport(configuration).init(
            new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
    for (Footer footer : footers) {
        final Path file = footer.getFile();
        LOG.debug(file);
        FileSystem fs = file.getFileSystem(configuration);
        FileStatus fileStatus = fs.getFileStatus(file);
        ParquetMetadata parquetMetaData = footer.getParquetMetadata();
        List<BlockMetaData> blocks = parquetMetaData.getBlocks();
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        splits.addAll(generateSplits(blocks, fileBlockLocations, fileStatus, parquetMetaData.getFileMetaData(),
                readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(), minSplitSize,
                maxSplitSize));
    }
    return splits;
}

From source file:StorageEngineClient.ColumnStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        tmpPath = keypath;

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (blkLocations.length == 0) {
            continue;
        }
        if (blkLocations.length == 1) {
            FormatStorageInputSplit split = new FormatStorageInputSplit(keypath, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();

            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);

            ISegmentIndex segmentIndex = ifd.segIndex();

            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                FormatStorageInputSplit split = new FormatStorageInputSplit(keypath, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }

            ifd.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
        return splits.toArray(new FormatStorageInputSplit[splits.size()]);
    }

    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}

From source file:StorageEngineClient.CombineFileInputFormat.java

License:Open Source License

private void getMoreSplitsWithStatus(JobConf job, Path[] paths1, Map<String, FileStatus> fileNameToStatus,
        long maxSize, long minSizeNode, long minSizeRack, List<CombineFileSplit> splits)
        throws IOException, NullGzFileException {
    if (paths1.length == 0) {
        return;
    }

    Path[] paths = paths1;
    ArrayList<Path> splitable = new ArrayList<Path>();
    ArrayList<Path> unsplitable = new ArrayList<Path>();
    for (int i = 0; i < paths1.length; i++) {
        if (isSplitable(paths1[i].getFileSystem(job), paths1[i])) {
            splitable.add(paths1[i]);
        } else {
            unsplitable.add(paths1[i]);
        }
    }
    if (unsplitable.size() != 0) {
        paths = new Path[splitable.size()];
        splitable.toArray(paths);
    }

    OneFileInfo[] files;

    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];

    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], fileNameToStatus.get(paths[i].toString()), job, rackToBlocks,
                blockToNodes, nodeToBlocks);
        totLength += files[i].getLength();
    }

    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> onenode = iter.next();
        this.processsplit(job, onenode, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "node");
    }

    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> onerack = iter.next();
        this.processsplit(job, onerack, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "rack");
    }

    this.processsplit(job, null, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");

    int maxFileNumPerSplit = job.getInt("hive.merge.inputfiles.maxFileNumPerSplit", 1000);

    HashSet<OneBlockInfo> hs = new HashSet<OneBlockInfo>();
    while (blockToNodes.size() > 0) {
        ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
        List<String> nodes = new ArrayList<String>();
        int filenum = 0;
        hs.clear();
        for (OneBlockInfo blockInfo : blockToNodes.keySet()) {
            validBlocks.add(blockInfo);
            filenum++;
            for (String host : blockInfo.hosts) {
                nodes.add(host);
            }
            hs.add(blockInfo);
            if (filenum >= maxFileNumPerSplit) {
                break;
            }
        }
        for (OneBlockInfo blockInfo : hs) {
            blockToNodes.remove(blockInfo);
        }
        this.addCreatedSplit(job, splits, nodes, validBlocks);
    }

    if (unsplitable.size() != 0) {

        HashMap<OneBlockInfo, String[]> fileToNodes = new HashMap<OneBlockInfo, String[]>();

        for (Path path : unsplitable) {
            FileSystem fs = path.getFileSystem(job);
            FileStatus stat = fileNameToStatus.get(path.toString());//fs.getFileStatus(path);
            long len = stat.getLen();
            BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, len);
            if (locations.length == 0) {
                console.printError("The file " + path.toUri().toString() + " may be empty, please check it!");
                throw new NullGzFileException(
                        "The file " + path.toUri().toString() + " may be empty, please check it!");
            }

            LOG.info("unsplitable file:" + path.toUri().toString() + " length:" + len);

            OneBlockInfo oneblock = new OneBlockInfo(path, 0, len, locations[0].getHosts(),
                    locations[0].getTopologyPaths());
            fileToNodes.put(oneblock, locations[0].getHosts());
        }

        this.processsplitForUnsplit(job, null, fileToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");
    }
}

From source file:StorageEngineClient.FormatStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        tmpPath = path;
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length == 0) {
            continue;
        } else if (blkLocations.length == 1) {
            FormatStorageInputSplit split = new FormatStorageInputSplit(path, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifdf = new IFormatDataFile(job);
            ifdf.open(filename);

            ISegmentIndex segmentIndex = ifdf.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                blkLocations = fs.getFileBlockLocations(file, segmentIndex.getSegOffset(i),
                        segmentIndex.getseglen(i));

                FormatStorageInputSplit split = new FormatStorageInputSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[0].getHosts());
                splits.add(split);
            }
            ifdf.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}

From source file:StorageEngineClient.HashMultiFileColumnStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }

    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);

    List<Path> paths = new ArrayList<Path>();
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }

        paths.add(new Path(filekey));
    }

    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}

From source file:StorageEngineClient.HashMultiFileFormatStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);

    List<Path> paths = new ArrayList<Path>(10);
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }

        paths.add(path);
    }

    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}

From source file:wiki.hadoop.mapred.lib.input.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in, split.getStart());

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
    return splits;
}