Example usage for the org.apache.hadoop.mapred.lib CombineFileSplit constructor

Introduction

On this page you can find example usage for the org.apache.hadoop.mapred.lib CombineFileSplit constructor.

Prototype

public CombineFileSplit(JobConf job, Path[] files, long[] start, long[] lengths, String[] locations) 
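
Before the harvested examples below, here is a minimal, self-contained sketch of calling this constructor directly. The file paths, byte lengths, and host names are hypothetical placeholders, not values taken from the usage examples.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

public class CombineFileSplitSketch {
    /** Packs two hypothetical files into one split, each read in full from offset 0. */
    public static CombineFileSplit makeSplit(JobConf job) {
        Path[] files = { new Path("/data/part-00000"), new Path("/data/part-00001") }; // placeholder paths
        long[] starts = { 0L, 0L }; // byte offset at which reading starts in each file
        long[] lengths = { 1024L, 2048L }; // assumed number of bytes to read from each file
        String[] hosts = { "host1", "host2" }; // assumed preferred locations
        return new CombineFileSplit(job, files, starts, lengths, hosts);
    }
}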

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapred.GFInputFormat.java

License: Apache License

/**
 * Creates an input split for every block occupied by hoplogs of the input
 * regions.
 * 
 * @param job the job configuration
 * @param hoplogs the hoplog files to split
 * @return array of input splits of type file input split
 * @throws IOException
 */
private InputSplit[] createSplits(JobConf job, Collection<FileStatus> hoplogs) throws IOException {
    if (hoplogs == null || hoplogs.isEmpty()) {
        return new InputSplit[0];
    }

    HoplogOptimizedSplitter splitter = new HoplogOptimizedSplitter(hoplogs);
    // 'conf' is presumably a Configuration field of the enclosing input format (not shown in this snippet)
    List<org.apache.hadoop.mapreduce.InputSplit> mr2Splits = splitter.getOptimizedSplits(conf);
    InputSplit[] splits = new InputSplit[mr2Splits.size()];
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit inputSplit : mr2Splits) {
        org.apache.hadoop.mapreduce.lib.input.CombineFileSplit mr2Split;
        mr2Split = (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) inputSplit;

        CombineFileSplit split = new CombineFileSplit(job, mr2Split.getPaths(), mr2Split.getStartOffsets(),
                mr2Split.getLengths(), mr2Split.getLocations());
        splits[i] = split;
        i++;
    }

    return splits;
}
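
This example bridges Hadoop's two APIs: the splits are computed with the newer org.apache.hadoop.mapreduce machinery, and each resulting mapreduce.lib.input.CombineFileSplit is then unpacked into the paths, start offsets, lengths, and locations arrays that the older mapred-package constructor expects.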

From source file: com.ricemap.spateDB.mapred.FileSplitUtil.java

License: Apache License

/**
 * Combines a number of file splits into one CombineFileSplit. If the number
 * of splits to be combined is one, it returns that split as is without
 * creating a CombineFileSplit.
 * @param conf
 * @param splits
 * @param startIndex
 * @param count
 * @return a single InputSplit covering the requested range of splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count)
        throws IOException {
    if (count == 1) {
        return splits.get(startIndex);
    } else {
        Path[] paths = new Path[count];
        long[] starts = new long[count];
        long[] lengths = new long[count];
        Vector<String> vlocations = new Vector<String>();
        while (count > 0) {
            paths[count - 1] = splits.get(startIndex).getPath();
            starts[count - 1] = splits.get(startIndex).getStart();
            lengths[count - 1] = splits.get(startIndex).getLength();
            vlocations.addAll(Arrays.asList(splits.get(startIndex).getLocations()));
            count--;
            startIndex++;
        }
        // prioritizeLocations is a helper elsewhere in this class that ranks the collected hosts
        String[] locations = prioritizeLocations(vlocations);
        return new CombineFileSplit(conf, paths, starts, lengths, locations);
    }
}
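
A hedged usage sketch of the method above, assuming FileSplitUtil is on the classpath and a list of FileSplits has already been produced by an input format; the count of three is arbitrary:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import com.ricemap.spateDB.mapred.FileSplitUtil;

public class CombineUsageSketch {
    public static InputSplit combineFirstThree(JobConf conf, List<FileSplit> fileSplits) throws IOException {
        // merges fileSplits[0..2] into a single CombineFileSplit
        return FileSplitUtil.combineFileSplits(conf, fileSplits, 0, 3);
    }
}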

From source file: com.ricemap.spateDB.mapred.FileSplitUtil.java

License: Apache License

/**
 * Combines two file splits into a CombineFileSplit.
 * @param conf
 * @param split1
 * @param split2
 * @return a CombineFileSplit covering both input splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, FileSplit split1, FileSplit split2)
        throws IOException {
    Path[] paths = new Path[2];
    long[] starts = new long[2];
    long[] lengths = new long[2];
    Vector<String> vlocations = new Vector<String>();
    paths[0] = split1.getPath();
    starts[0] = split1.getStart();
    lengths[0] = split1.getLength();
    vlocations.addAll(Arrays.asList(split1.getLocations()));
    paths[1] = split2.getPath();
    starts[1] = split2.getStart();
    lengths[1] = split2.getLength();
    vlocations.addAll(Arrays.asList(split2.getLocations()));
    String[] locations = prioritizeLocations(vlocations);
    return new CombineFileSplit(conf, paths, starts, lengths, locations);
}

From source file: edu.umn.cs.spatialHadoop.mapred.FileSplitUtil.java

License: Open Source License

/**
 * Combines a number of file splits into one CombineFileSplit. If the number
 * of splits to be combined is one, it returns that split as is without
 * creating a CombineFileSplit.
 * @param conf
 * @param splits
 * @param startIndex
 * @param count
 * @return a single InputSplit covering the requested range of splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count)
        throws IOException {
    if (count == 1) {
        return splits.get(startIndex);
    } else {
        Path[] paths = new Path[count];
        long[] starts = new long[count];
        long[] lengths = new long[count];
        Vector<String> vlocations = new Vector<String>();
        while (count > 0) {
            paths[count - 1] = splits.get(startIndex).getPath();
            starts[count - 1] = splits.get(startIndex).getStart();
            lengths[count - 1] = splits.get(startIndex).getLength();
            vlocations.addAll(Arrays.asList(splits.get(startIndex).getLocations()));
            count--;
            startIndex++;
        }
        String[] locations = prioritizeLocations(vlocations);
        if (locations.length > 3) {
            String[] topLocations = new String[3];
            System.arraycopy(locations, 0, topLocations, 0, topLocations.length);
            locations = topLocations;
        }
        return new CombineFileSplit(conf, paths, starts, lengths, locations);
    }
}
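
Compared with the spateDB version above, this variant additionally caps the location list at three hosts, which matches HDFS's default replication factor of three, so a split never advertises more preferred hosts than there are likely block replicas.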

From source file: org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License: Mozilla Public License

/**
 * Splits the input collection into sets of files where each Map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            // length of this block: a full block, or the shorter remainder at the end of the file
            final long blocklength = Math.min(normalblocksize, lengths[i] - offset);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocklength);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocklength, blocklength);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; // final recommended locations for this split
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            // merge this file's per-host byte counts into the running totals for the split
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            // keep at most three hosts: all of them if few, otherwise the three with the most bytes
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (pathsUsed != paths.length) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}
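
Note that splitStarts is allocated but never written, so it stays zero-filled (Java initializes long arrays to 0): every file in a split is read from offset 0 for its full length, and the per-host byte accounting serves only to choose up to three preferred locations for each split.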