Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem#getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing hostnames, offset and size of portions of the given file.
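
Before the project examples below, here is a minimal, self-contained sketch of a typical call: obtain the FileStatus of a file, ask for the block locations covering a byte range (here the whole file), and read the hosts from each BlockLocation. The class name and the path /tmp/example.txt are placeholders chosen for illustration, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path file = new Path("/tmp/example.txt"); // placeholder path, assumed to exist
        FileStatus status = fs.getFileStatus(file);

        // Block locations for the whole file: start = 0, len = file length
        BlockLocation[] locations = fs.getFileBlockLocations(file, 0, status.getLen());

        for (BlockLocation loc : locations) {
            System.out.println("offset=" + loc.getOffset()
                    + " length=" + loc.getLength()
                    + " hosts=" + String.join(",", loc.getHosts()));
        }
    }
}

Callers that build MapReduce splits, as in the examples below, typically pass the start offset and length of a candidate split rather than the whole file, and use the returned hosts to place each split near its data.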

Usage

From source file:org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java

License:Mozilla Public License

/** 
 * {@inheritDoc}
 * Make the splits of the index structure. Bit structures split across multiple files are supported.
 */
@SuppressWarnings("unchecked")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    HadoopUtility.loadTerrierJob(job);

    final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
    final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
    Index.setIndexLoadingProfileAsRetrieval(false);
    final IndexOnDisk index = HadoopUtility.fromHConfiguration(job);

    final byte fileCount = Byte
            .parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
    final Path bitPostingStructureFiles[] = new Path[fileCount];
    final FileStatus[] fss = new FileStatus[fileCount];
    final long[] bitPostingStructureFSBlockSizes = new long[fileCount];

    logger.info("Calculating splits of structure " + bitPostingStructureName);
    FileSystem fs = FileSystem.get(job);
    for (byte i = 0; i < fileCount; i++) {
        bitPostingStructureFiles[i] = new Path(
                BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
        fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
        bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
        logger.info("File " + i + " approx splits="
                + ((double) fss[i].getLen() / (double) bitPostingStructureFSBlockSizes[i]));
    }

    //this smells of a hack, because we don't have a strategy for naming the various index structure streams
    final Iterator<? extends BitIndexPointer> offsetIterator = index
            .hasIndexStructureInputStream(lookupStructureName + "-entry")
                    ? (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName + "-entry")
                    : (Iterator<? extends BitIndexPointer>) index
                            .getIndexStructureInputStream(lookupStructureName);

    if (offsetIterator == null)
        throw new IOException("No such stream structure called " + lookupStructureName + "-entry or "
                + lookupStructureName + " found in index");
    final List<InputSplit> splitList = new ArrayList<InputSplit>();

    int currentId = 0;

    //size of the current split of each file
    final long[] blockSize = new long[fileCount];
    //location of the last split for each file
    final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];

    //how many entries will be in this split, for each file
    final int[] entriesInBlock = new int[fileCount];
    //what is the starting id of the next entry split, for each file
    final int[] firstEntryOfNextSplit = new int[fileCount];

    //number of splits per file, for logging only
    final int[] splitsPerFile = new int[fileCount];

    Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);

    BitIndexPointer currentPointer = null;
    //iterate through the lookup iterator
    //split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
    while (offsetIterator.hasNext()) {
        //ok, where is the next pointer to
        currentPointer = offsetIterator.next();
        final byte fileId = currentPointer.getFileNumber();

        //what is the first entry of the next split of this file?
        firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
        //this split will have one more entry
        entriesInBlock[fileId]++;

        //what is our current offset?
        long offset = currentPointer.getOffset();
        //System.err.println("Offset" + offset);
        //if we made the split here, how big would it be?
        blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
        //is this block large enough?
        if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId]) {
            //yes, it's big enough
            //block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                    bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
            splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path
                    bitPostingStructureSplitEndOffsets[fileId], //start
                    blockSize[fileId], //length
                    blkLocations[0].getHosts(), //hosts
                    firstEntryOfNextSplit[fileId], //first entry in this split
                    entriesInBlock[fileId]) //number of entries in this split
            );
            logger.info("File " + fileId + " split " + (splitList.size() - 1) + " "
                    + splitList.get(splitList.size() - 1).toString());
            //record another split for this file (for logging only)
            splitsPerFile[fileId]++;
            //update recording of last offset for this file
            bitPostingStructureSplitEndOffsets[fileId] = offset;
            //reset size of split for this file
            blockSize[fileId] = 0;
            //reset counter of entries in split of this file
            entriesInBlock[fileId] = 0;
            //reset the first offset of this split
            firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
        }

        //ids always increment
        currentId++;
    }
    IndexUtil.close(offsetIterator);
    //find any files which have trailing blocks
    for (byte fileId = 0; fileId < fileCount; fileId++) {
        if (entriesInBlock[fileId] == 0)
            continue;
        assert (firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);

        //block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
        BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
        splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path of file for split
                bitPostingStructureSplitEndOffsets[fileId], //start offset of this split
                blockSize[fileId], //size of this split
                blkLocations[0].getHosts(), //hosts for this split
                firstEntryOfNextSplit[fileId], //first entry id for this split
                entriesInBlock[fileId]) //number of entries in this split
        );
        logger.info("File " + fileId + " trailing split " + (splitList.size() - 1) + " "
                + splitList.get(splitList.size() - 1).toString());

        //record another split for this file (for logging only)
        splitsPerFile[fileId]++;
    }

    logger.info("Split " + bitPostingStructureName + " (of " + currentId + " entries) into " + splitList.size()
            + " splits");
    if (fileCount > 1) {
        logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: "
                + ArrayUtils.join(splitsPerFile, ","));
    }
    assert (splitList.size() > 0);
    index.close();
    return splitList.toArray(new InputSplit[splitList.size()]);
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License:Mozilla Public License

/**
 * Splits the input collection into sets of files where each Map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            final long blocksize = Math.min(offset + normalblocksize, lengths[i]);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocksize, blocksize);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate the split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; //final recommended locations for this split.
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (!(pathsUsed == paths.length)) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}

From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file:parquet.hadoop.ParquetInputFormat.java

License:Apache License

/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 */
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long minSplitSize = Math.max(getFormatMinSplitSize(),
            configuration.getLong("mapred.min.split.size", 0L));
    if (maxSplitSize < 0 || minSplitSize < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
                + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers,
            configuration.getBoolean(STRICT_TYPE_CHECKING, true));
    ReadContext readContext = getReadSupport(configuration).init(
            new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
    for (Footer footer : footers) {
        final Path file = footer.getFile();
        LOG.debug(file);
        FileSystem fs = file.getFileSystem(configuration);
        FileStatus fileStatus = fs.getFileStatus(file);
        ParquetMetadata parquetMetaData = footer.getParquetMetadata();
        List<BlockMetaData> blocks = parquetMetaData.getBlocks();
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        splits.addAll(generateSplits(blocks, fileBlockLocations, fileStatus, parquetMetaData.getFileMetaData(),
                readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(), minSplitSize,
                maxSplitSize));
    }
    return splits;
}

From source file:StorageEngineClient.ColumnStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        tmpPath = keypath;

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (blkLocations.length == 0) {
            continue;
        }
        if (blkLocations.length == 1) {
            FormatStorageInputSplit split = new FormatStorageInputSplit(keypath, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();

            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);

            ISegmentIndex segmentIndex = ifd.segIndex();

            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                FormatStorageInputSplit split = new FormatStorageInputSplit(keypath, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }

            ifd.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
        return splits.toArray(new FormatStorageInputSplit[splits.size()]);
    }

    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}

From source file:StorageEngineClient.CombineFileInputFormat.java

License:Open Source License

private void getMoreSplitsWithStatus(JobConf job, Path[] paths1, Map<String, FileStatus> fileNameToStatus,
        long maxSize, long minSizeNode, long minSizeRack, List<CombineFileSplit> splits)
        throws IOException, NullGzFileException {
    if (paths1.length == 0) {
        return;
    }

    Path[] paths = paths1;
    ArrayList<Path> splitable = new ArrayList<Path>();
    ArrayList<Path> unsplitable = new ArrayList<Path>();
    for (int i = 0; i < paths1.length; i++) {
        if (isSplitable(paths1[i].getFileSystem(job), paths1[i])) {
            splitable.add(paths1[i]);
        } else {
            unsplitable.add(paths1[i]);
        }
    }
    if (unsplitable.size() != 0) {
        paths = new Path[splitable.size()];
        splitable.toArray(paths);
    }

    OneFileInfo[] files;

    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];

    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], fileNameToStatus.get(paths[i].toString()), job, rackToBlocks,
                blockToNodes, nodeToBlocks);
        totLength += files[i].getLength();
    }

    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> onenode = iter.next();
        this.processsplit(job, onenode, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "node");
    }

    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> onerack = iter.next();
        this.processsplit(job, onerack, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "rack");
    }

    this.processsplit(job, null, blockToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");

    int maxFileNumPerSplit = job.getInt("hive.merge.inputfiles.maxFileNumPerSplit", 1000);

    HashSet<OneBlockInfo> hs = new HashSet<OneBlockInfo>();
    while (blockToNodes.size() > 0) {
        ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
        List<String> nodes = new ArrayList<String>();
        int filenum = 0;
        hs.clear();
        for (OneBlockInfo blockInfo : blockToNodes.keySet()) {
            validBlocks.add(blockInfo);
            filenum++;
            for (String host : blockInfo.hosts) {
                nodes.add(host);
            }
            hs.add(blockInfo);
            if (filenum >= maxFileNumPerSplit) {
                break;
            }
        }
        for (OneBlockInfo blockInfo : hs) {
            blockToNodes.remove(blockInfo);
        }
        this.addCreatedSplit(job, splits, nodes, validBlocks);
    }

    if (unsplitable.size() != 0) {

        HashMap<OneBlockInfo, String[]> fileToNodes = new HashMap<OneBlockInfo, String[]>();

        for (Path path : unsplitable) {
            FileSystem fs = path.getFileSystem(job);
            FileStatus stat = fileNameToStatus.get(path.toString());//fs.getFileStatus(path);
            long len = stat.getLen();
            BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, len);
            if (locations.length == 0) {
                console.printError("The file " + path.toUri().toString() + " may be empty, please check it!");
                throw new NullGzFileException(
                        "The file " + path.toUri().toString() + " may be empty, please check it!");
            }

            LOG.info("unsplitable file:" + path.toUri().toString() + " length:" + len);

            OneBlockInfo oneblock = new OneBlockInfo(path, 0, len, locations[0].getHosts(),
                    locations[0].getTopologyPaths());
            fileToNodes.put(oneblock, locations[0].getHosts());
        }

        this.processsplitForUnsplit(job, null, fileToNodes, maxSize, minSizeNode, minSizeRack, splits, "all");
    }
}

From source file:StorageEngineClient.FormatStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<FormatStorageInputSplit> splits = new ArrayList<FormatStorageInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        tmpPath = path;
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length == 0) {
            continue;
        } else if (blkLocations.length == 1) {
            FormatStorageInputSplit split = new FormatStorageInputSplit(path, length,
                    blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifdf = new IFormatDataFile(job);
            ifdf.open(filename);

            ISegmentIndex segmentIndex = ifdf.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                blkLocations = fs.getFileBlockLocations(file, segmentIndex.getSegOffset(i),
                        segmentIndex.getseglen(i));

                FormatStorageInputSplit split = new FormatStorageInputSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[0].getHosts());
                splits.add(split);
            }
            ifdf.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new FormatStorageInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit[splits.size()]);
}

From source file:StorageEngineClient.HashMultiFileColumnStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    for (FileStatus file : listStatus(job)) {
        String filestr = file.getPath().toString();
        String filekey = filestr.substring(0, filestr.lastIndexOf("_idx"));
        if (!files.containsKey(filekey)) {
            files.put(filekey, file);
        } else {
            if (file.getLen() > files.get(filekey).getLen()) {
                files.put(filekey, file);
            }
        }
    }

    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);

    List<Path> paths = new ArrayList<Path>();
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }

        paths.add(new Path(filekey));
    }

    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}

From source file:StorageEngineClient.HashMultiFileFormatStorageInputFormat.java

License:Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);

    List<Path> paths = new ArrayList<Path>(10);
    int count = 0;
    BlockLocation[] blkLocations = null;
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();

        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();

        if (count == 0) {
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            count++;
        }

        paths.add(path);
    }

    if (paths.size() == 0 || blkLocations == null) {
        splits.add(new MultiFormatStorageSplit(new Path[0], new String[0]));
        return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
    }

    int blkIndex = getBlockIndex(blkLocations, 0);
    MultiFormatStorageSplit split = new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]),
            blkLocations[blkIndex].getHosts());
    splits.add(split);

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new MultiFormatStorageSplit[splits.size()]);
}

From source file:wiki.hadoop.mapred.lib.input.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in, split.getStart());

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
    return splits;
}