List of usage examples for the org.apache.hadoop.mapred.lib.CombineFileSplit constructor
public CombineFileSplit(JobConf job, Path[] files, long[] start, long[] lengths, String[] locations)
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapred.GFInputFormat.java
License:Apache License
/**
 * Converts the optimized splits computed over the given hoplogs into
 * old-API (mapred) combine-file input splits, one per optimized split.
 *
 * @param job job configuration handed to every created split
 * @param hoplogs hoplog files of the input regions; may be null or empty
 * @return array of combine-file input splits; empty when there are no hoplogs
 * @throws IOException declared for callers; not thrown directly by this method
 */
private InputSplit[] createSplits(JobConf job, Collection<FileStatus> hoplogs) throws IOException {
    final boolean nothingToSplit = (hoplogs == null) || hoplogs.isEmpty();
    if (nothingToSplit) {
        return new InputSplit[0];
    }

    // NOTE(review): 'conf' is not the 'job' parameter; presumably a field of
    // the enclosing class -- confirm it is initialised before this call.
    List<org.apache.hadoop.mapreduce.InputSplit> optimized =
        new HoplogOptimizedSplitter(hoplogs).getOptimizedSplits(conf);

    InputSplit[] converted = new InputSplit[optimized.size()];
    int next = 0;
    for (org.apache.hadoop.mapreduce.InputSplit raw : optimized) {
        // The splitter yields new-API splits; re-wrap each one as an old-API split.
        org.apache.hadoop.mapreduce.lib.input.CombineFileSplit source =
            (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) raw;
        converted[next] = new CombineFileSplit(job, source.getPaths(), source.getStartOffsets(),
            source.getLengths(), source.getLocations());
        next++;
    }
    return converted;
}
From source file:com.ricemap.spateDB.mapred.FileSplitUtil.java
License:Apache License
/** * Combines a number of file splits into one CombineFileSplit. If number of * splits to be combined is one, it returns this split as is without creating * a CombineFileSplit./*from ww w . j a v a2 s . c o m*/ * @param splits * @param startIndex * @param count * @return * @throws IOException */ public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count) throws IOException { if (count == 1) { return splits.get(startIndex); } else { Path[] paths = new Path[count]; long[] starts = new long[count]; long[] lengths = new long[count]; Vector<String> vlocations = new Vector<String>(); while (count > 0) { paths[count - 1] = splits.get(startIndex).getPath(); starts[count - 1] = splits.get(startIndex).getStart(); lengths[count - 1] = splits.get(startIndex).getLength(); vlocations.addAll(Arrays.asList(splits.get(startIndex).getLocations())); count--; startIndex++; } String[] locations = prioritizeLocations(vlocations); return new CombineFileSplit(conf, paths, starts, lengths, locations); } }
From source file:com.ricemap.spateDB.mapred.FileSplitUtil.java
License:Apache License
/**
 * Merges two file splits into one CombineFileSplit.
 *
 * @param conf job configuration passed to the combined split
 * @param split1 split stored at index 0 of the combined split
 * @param split2 split stored at index 1 of the combined split
 * @return a CombineFileSplit covering both input splits
 * @throws IOException propagated from the calls made on the splits
 */
public static InputSplit combineFileSplits(JobConf conf, FileSplit split1, FileSplit split2)
        throws IOException {
    FileSplit[] pair = new FileSplit[] { split1, split2 };
    Path[] paths = new Path[pair.length];
    long[] starts = new long[pair.length];
    long[] lengths = new long[pair.length];
    Vector<String> hosts = new Vector<String>();

    // Each split is read completely before moving on to the next one.
    for (int i = 0; i < pair.length; i++) {
        paths[i] = pair[i].getPath();
        starts[i] = pair[i].getStart();
        lengths[i] = pair[i].getLength();
        hosts.addAll(Arrays.asList(pair[i].getLocations()));
    }

    return new CombineFileSplit(conf, paths, starts, lengths, prioritizeLocations(hosts));
}
From source file:edu.umn.cs.spatialHadoop.mapred.FileSplitUtil.java
License:Open Source License
/**
 * Merges {@code count} consecutive file splits, beginning at
 * {@code startIndex}, into a single CombineFileSplit that advertises at
 * most three locations. When exactly one split is requested, that split is
 * returned untouched and no CombineFileSplit is created.
 *
 * @param conf job configuration passed to the combined split
 * @param splits list holding the splits to merge
 * @param startIndex index of the first split to merge
 * @param count number of consecutive splits to merge
 * @return the single original split, or a CombineFileSplit covering all of them
 * @throws IOException propagated from the calls made on the splits
 */
public static InputSplit combineFileSplits(JobConf conf, List<FileSplit> splits, int startIndex, int count)
        throws IOException {
    if (count == 1) {
        return splits.get(startIndex);
    }

    Path[] paths = new Path[count];
    long[] starts = new long[count];
    long[] lengths = new long[count];
    Vector<String> hosts = new Vector<String>();

    // The arrays are filled from the back, so the first merged split ends
    // up in the last slot.
    int source = startIndex;
    for (int slot = count - 1; slot >= 0; slot--) {
        FileSplit current = splits.get(source++);
        paths[slot] = current.getPath();
        starts[slot] = current.getStart();
        lengths[slot] = current.getLength();
        hosts.addAll(Arrays.asList(current.getLocations()));
    }

    String[] locations = prioritizeLocations(hosts);
    // Keep only the three leading entries of the prioritized list.
    if (locations.length > 3) {
        locations = Arrays.copyOf(locations, 3);
    }
    return new CombineFileSplit(conf, paths, starts, lengths, locations);
}
From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java
License:Mozilla Public License
@SuppressWarnings("unchecked") @Override// w ww . j a va2s.c o m /** * Splits the input collection into * sets of files where each Map task * gets about the same number of files */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { Path[] paths = FileInputFormat.getInputPaths(job); // HADOOP-1818: Manage splits only if there are paths if (paths.length == 0) { return new InputSplit[0]; } if (numSplits > paths.length) { numSplits = paths.length; } else if (numSplits < 1) { numSplits = 1; } logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks"); List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>( numSplits); final int numPaths = paths.length; long[] lengths = new long[numPaths]; TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array .newInstance(TObjectLongHashMap.class, numPaths); final FileSystem fs = FileSystem.get(job); for (int i = 0; i < paths.length; i++) { final FileStatus fss = fs.getFileStatus(paths[i]); lengths[i] = fss.getLen(); final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>(); final long normalblocksize = fss.getBlockSize(); for (long offset = 0; offset < lengths[i]; offset += normalblocksize) { final long blocksize = Math.min(offset + normalblocksize, lengths[i]); final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize); for (BlockLocation bl : blockLocations) { for (String host : bl.getHosts()) { location2size.adjustOrPutValue(host, blocksize, blocksize); } } } } //we need to over-estimate using ceil, to ensure that the last split is not /too/ big final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits); int pathsUsed = 0; int splitnum = 0; CombineFileSplit mfs; // for each split except the last one (which may be smaller than numberOfFilesPerSplit) while (pathsUsed < numPaths) { /* caclulate split size 
for this task - usually numberOfFilesPerSplit, but * less than this for the last split */ final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed : numberOfFilesPerSplit; //arrays of information for split Path[] splitPaths = new Path[splitSizeForThisSplit]; long[] splitLengths = new long[splitSizeForThisSplit]; long[] splitStarts = new long[splitSizeForThisSplit]; final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>(); String[] splitLocations = null; //final recommended locations for this split. for (int i = 0; i < splitSizeForThisSplit; i++) { locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() { public boolean execute(String a, long b) { allLocationsForSplit.adjustOrPutValue(a, b, b); return true; } }); if (allLocationsForSplit.size() <= 3) { splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); } else { String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); Arrays.sort(hosts, new Comparator<String>() { public int compare(String o1, String o2) { long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2); if (diffamount > 0) { return -1; } else if (diffamount < 0) { return 1; } return 0; } }); splitLocations = new String[3]; System.arraycopy(hosts, 0, splitLocations, 0, 3); } } //copy information for this split System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit); System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit); //count the number of paths consumed pathsUsed += splitSizeForThisSplit; //make the actual split object //logger.info("New split of size " + splitSizeForThisSplit); mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations); splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum)); splitnum++; } if (!(pathsUsed == paths.length)) { throw new IOException("Number of used paths does not equal 
total available paths!"); } return splits.toArray(new PositionAwareSplit[splits.size()]); }