Example usage for the org.apache.hadoop.mapred.lib CombineFileSplit constructor

Introduction

On this page you can find example usage for the org.apache.hadoop.mapred.lib CombineFileSplit constructor.

Prototype

public CombineFileSplit(JobConf job, Path[] files, long[] start, long[] lengths, String[] locations) 
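
Before the harvested examples below, here is a minimal, self-contained sketch of calling this constructor directly. The file paths, byte lengths, and host names are hypothetical placeholders, not values taken from the usage examples.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

public class CombineFileSplitSketch {
    /** Packs two hypothetical files into one split, each read in full from offset 0. */
    public static CombineFileSplit makeSplit(JobConf job) {
        Path[] files = { new Path("/data/part-00000"), new Path("/data/part-00001") }; // placeholder paths
        long[] starts = { 0L, 0L }; // byte offset at which reading starts in each file
        long[] lengths = { 1024L, 2048L }; // assumed number of bytes to read from each file
        String[] hosts = { "host1", "host2" }; // assumed preferred locations
        return new CombineFileSplit(job, files, starts, lengths, hosts);
    }
}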

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapred.GFInputFormat.java

License: Apache License

/**
 * Creates an input split for every block occupied by hoplogs of the input
 * regions.
 * 
 * @param job the job configuration
 * @param hoplogs the hoplog files to split
 * @return array of input splits of type file input split
 * @throws IOException
 */
private InputSplit[] createSplits(JobConf job, Collection<FileStatus> hoplogs) throws IOException {
    if (hoplogs == null || hoplogs.isEmpty()) {
        return new InputSplit[0];
    }

    HoplogOptimizedSplitter splitter = new HoplogOptimizedSplitter(hoplogs);
    // 'conf' is presumably a Configuration field of the enclosing input format (not shown in this snippet)
    List<org.apache.hadoop.mapreduce.InputSplit> mr2Splits = splitter.getOptimizedSplits(conf);
    InputSplit[] splits = new InputSplit[mr2Splits.size()];
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit inputSplit : mr2Splits) {
        org.apache.hadoop.mapreduce.lib.input.CombineFileSplit mr2Split;
        mr2Split = (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) inputSplit;

        CombineFileSplit split = new CombineFileSplit(job, mr2Split.getPaths(), mr2Split.getStartOffsets(),
                mr2Split.getLengths(), mr2Split.getLocations());
        splits[i] = split;
        i++;
    }

    return splits;
}
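
This example bridges Hadoop's two APIs: the splits are computed with the newer org.apache.hadoop.mapreduce machinery, and each resulting mapreduce.lib.input.CombineFileSplit is then unpacked into the paths, start offsets, lengths, and locations arrays that the older mapred-package constructor expects.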

From source file: com.ricemap.spateDB.mapred.FileSplitUtil.java

License: Apache License

/**
 * Combines a number of file splits into one CombineFileSplit. If the number
 * of splits to be combined is one, it returns that split as is without
 * creating a CombineFileSplit.
 * @param conf
 * @param splits
 * @param startIndex
 * @param count
 * @return a single InputSplit covering the requested range of splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count)
        throws IOException {
    if (count == 1) {
        return splits.get(startIndex);
    } else {
        Path[] paths = new Path[count];
        long[] starts = new long[count];
        long[] lengths = new long[count];
        Vector<String> vlocations = new Vector<String>();
        while (count > 0) {
            paths[count - 1] = splits.get(startIndex).getPath();
            starts[count - 1] = splits.get(startIndex).getStart();
            lengths[count - 1] = splits.get(startIndex).getLength();
            vlocations.addAll(Arrays.asList(splits.get(startIndex).getLocations()));
            count--;
            startIndex++;
        }
        // prioritizeLocations is a helper elsewhere in this class that ranks the collected hosts
        String[] locations = prioritizeLocations(vlocations);
        return new CombineFileSplit(conf, paths, starts, lengths, locations);
    }
}
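
A hedged usage sketch of the method above, assuming FileSplitUtil is on the classpath and a list of FileSplits has already been produced by an input format; the count of three is arbitrary:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import com.ricemap.spateDB.mapred.FileSplitUtil;

public class CombineUsageSketch {
    public static InputSplit combineFirstThree(JobConf conf, List<FileSplit> fileSplits) throws IOException {
        // merges fileSplits[0..2] into a single CombineFileSplit
        return FileSplitUtil.combineFileSplits(conf, fileSplits, 0, 3);
    }
}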

From source file: com.ricemap.spateDB.mapred.FileSplitUtil.java

License: Apache License

/**
 * Combines two file splits into a CombineFileSplit.
 * @param conf
 * @param split1
 * @param split2
 * @return a CombineFileSplit covering both input splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, FileSplit split1, FileSplit split2)
        throws IOException {
    Path[] paths = new Path[2];
    long[] starts = new long[2];
    long[] lengths = new long[2];
    Vector<String> vlocations = new Vector<String>();
    paths[0] = split1.getPath();
    starts[0] = split1.getStart();
    lengths[0] = split1.getLength();
    vlocations.addAll(Arrays.asList(split1.getLocations()));
    paths[1] = split2.getPath();
    starts[1] = split2.getStart();
    lengths[1] = split2.getLength();
    vlocations.addAll(Arrays.asList(split2.getLocations()));
    String[] locations = prioritizeLocations(vlocations);
    return new CombineFileSplit(conf, paths, starts, lengths, locations);
}

From source file: edu.umn.cs.spatialHadoop.mapred.FileSplitUtil.java

License: Open Source License

/**
 * Combines a number of file splits into one CombineFileSplit. If the number
 * of splits to be combined is one, it returns that split as is without
 * creating a CombineFileSplit.
 * @param conf
 * @param splits
 * @param startIndex
 * @param count
 * @return a single InputSplit covering the requested range of splits
 * @throws IOException 
 */
public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count)
        throws IOException {
    if (count == 1) {
        return splits.get(startIndex);
    } else {
        Path[] paths = new Path[count];
        long[] starts = new long[count];
        long[] lengths = new long[count];
        Vector<String> vlocations = new Vector<String>();
        while (count > 0) {
            paths[count - 1] = splits.get(startIndex).getPath();
            starts[count - 1] = splits.get(startIndex).getStart();
            lengths[count - 1] = splits.get(startIndex).getLength();
            vlocations.addAll(Arrays.asList(splits.get(startIndex).getLocations()));
            count--;
            startIndex++;
        }
        String[] locations = prioritizeLocations(vlocations);
        if (locations.length > 3) {
            String[] topLocations = new String[3];
            System.arraycopy(locations, 0, topLocations, 0, topLocations.length);
            locations = topLocations;
        }
        return new CombineFileSplit(conf, paths, starts, lengths, locations);
    }
}
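
Compared with the spateDB version above, this variant additionally caps the location list at three hosts, which matches HDFS's default replication factor of three, so a split never advertises more preferred hosts than there are likely block replicas.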

From source file: org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License: Mozilla Public License

/**
 * Splits the input collection into sets of files where each Map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            // length of this block: a full block, or the shorter remainder at the end of the file
            final long blocklength = Math.min(normalblocksize, lengths[i] - offset);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocklength);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocklength, blocklength);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; // final recommended locations for this split
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            // merge this file's per-host byte counts into the running totals for the split
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            // keep at most three hosts: all of them if few, otherwise the three with the most bytes
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (pathsUsed != paths.length) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}
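
Note that splitStarts is allocated but never written, so it stays zero-filled (Java initializes long arrays to 0): every file in a split is read from offset 0 for its full length, and the per-host byte accounting serves only to choose up to three preferred locations for each split.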