List of usage examples for org.apache.hadoop.mapreduce.InputSplit#getLength()
/**
 * Returns the length of this split.
 *
 * <p>NOTE(review): the usage examples in this listing add the value into byte totals
 * (e.g. "mapInputBytes"), so the unit is presumably bytes; confirm against the Hadoop API docs.
 *
 * @return the split length
 * @throws IOException declared by the signature; the conditions are implementation-specific
 * @throws InterruptedException declared by the signature; the conditions are
 *         implementation-specific
 */
public abstract long getLength() throws IOException, InterruptedException;
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
public static double getTotalMapInputMB(Job job) throws ClassNotFoundException, IOException, InterruptedException, JobException { if (job == null) { throw new JobException("Job is null"); }//from w w w .j a v a 2 s . co m long mapInputBytes = 0; InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration()); for (InputSplit split : input.getSplits(job)) { mapInputBytes += split.getLength(); } if (mapInputBytes == 0) { throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!"); } double totalMapInputMB = (double) mapInputBytes / 1024 / 1024; return totalMapInputMB; }
From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.Builder.java
License:Apache License
/**
 * Reorders the given splits in place by size, largest first.<br>
 * According to the original author's note, this mirrors the ordering code used by Hadoop's
 * JobClient.
 *
 * @param splits
 *          input splits; the array itself is sorted
 * @throws IllegalStateException
 *           if the length of a split cannot be read while comparing
 */
public static void sortSplits(InputSplit[] splits) {
    Comparator<InputSplit> largestFirst = new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            long left;
            long right;
            try {
                left = a.getLength();
                right = b.getLength();
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            } catch (InterruptedException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
            // Inverted on purpose: the larger split must sort ahead of the smaller one.
            return left == right ? 0 : (left < right ? 1 : -1);
        }
    };
    Arrays.sort(splits, largestFirst);
}
From source file:org.apache.parquet.pig.ParquetLoader.java
License:Apache License
@Override public ResourceStatistics getStatistics(String location, Job job) throws IOException { if (DEBUG)/*from ww w .j a va 2 s .c o m*/ LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")"); /* We need to call setInput since setLocation is not guaranteed to be called before this */ setInput(location, job); long length = 0; try { for (InputSplit split : getParquetInputFormat().getSplits(job)) { length += split.getLength(); } } catch (InterruptedException e) { LOG.warn("Interrupted: ", e); return null; } ResourceStatistics stats = new ResourceStatistics(); // TODO use pig-0.12 setBytes api when its available stats.setmBytes(length / 1024 / 1024); return stats; }
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit.java
License:Apache License
@Override @SuppressWarnings("unchecked") public String[] getLocations() throws IOException, InterruptedException { if (locations == null) { HashMap<String, Long> locMap = new HashMap<String, Long>(); Long lenInMap;// w ww .ja va2 s . co m for (InputSplit split : wrappedSplits) { String[] locs = split.getLocations(); for (String loc : locs) { if ((lenInMap = locMap.get(loc)) == null) locMap.put(loc, split.getLength()); else locMap.put(loc, lenInMap + split.getLength()); } } Set<Map.Entry<String, Long>> entrySet = locMap.entrySet(); Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]); Arrays.sort(hostSize, new Comparator<Map.Entry<String, Long>>() { @Override public int compare(Entry<String, Long> o1, Entry<String, Long> o2) { long diff = o1.getValue() - o2.getValue(); if (diff < 0) return 1; if (diff > 0) return -1; return 0; } }); // maximum 5 locations are in list: refer to PIG-1648 for more details int nHost = Math.min(hostSize.length, 5); locations = new String[nHost]; for (int i = 0; i < nHost; ++i) { locations[i] = hostSize[i].getKey(); } } return locations; }
From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java
License:Apache License
/**
 * Groups the given input splits into lists of splits that are meant to be processed together,
 * using {@code maxCombinedSplitSize} as the target size of a group.
 *
 * <p>Steps, as written in the body:
 * <ul>
 * <li>Zero-length splits are skipped and only counted. If every split is empty, the result is
 *     a single group holding the first input split.</li>
 * <li>A split whose length is at least {@code maxCombinedSplitSize} becomes a group of its
 *     own.</li>
 * <li>Every other split is wrapped in a ComparableSplit and registered with one Node per
 *     distinct location (locations are sorted first; see PIG-1757). If exactly one such split
 *     exists it becomes its own group.</li>
 * <li>With more than one such split, the nodes are sorted and processed in turn: splits on a
 *     node are gathered with the help of a binary search for a split that fits the remaining
 *     space, and a gathered group is kept only when its total size exceeds half of
 *     {@code maxCombinedSplitSize}.</li>
 * <li>Splits still attached to nodes afterwards ("leftovers") are de-duplicated on the raw
 *     split and packed in sequence. The final partial group is appended to the first existing
 *     group that has enough room according to {@code resultLengths}; otherwise it is added as
 *     a new group.</li>
 * </ul>
 *
 * <p>NOTE(review): Node, ComparableSplit, DummySplit, nodeComparator and removeSplits are
 * defined outside this block; the description above assumes removeSplits detaches the given
 * splits from their nodes and that node.sort() orders splits by descending length (as the
 * inline comment states) - confirm against their definitions.
 *
 * @param oneInputSplits the splits of one input to be grouped
 * @param maxCombinedSplitSize the size threshold used when grouping splits
 * @param conf not referenced anywhere in this method body
 * @return the groups of splits; also logged as "Total input paths (combined) to process"
 * @throws IOException if a split fails to report its length or locations
 * @throws InterruptedException if interrupted while querying a split
 */
public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits, long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException { ArrayList<Node> nodes = new ArrayList<Node>(); HashMap<String, Node> nodeMap = new HashMap<String, Node>(); List<List<InputSplit>> result = new ArrayList<List<InputSplit>>(); List<Long> resultLengths = new ArrayList<Long>(); long comparableSplitId = 0; int size = 0, nSplits = oneInputSplits.size(); InputSplit lastSplit = null;//from w w w . j av a 2s. c o m int emptyCnt = 0; for (InputSplit split : oneInputSplits) { if (split.getLength() == 0) { emptyCnt++; continue; } if (split.getLength() >= maxCombinedSplitSize) { comparableSplitId++; ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>(); combinedSplits.add(split); result.add(combinedSplits); resultLengths.add(split.getLength()); } else { ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++); String[] locations = split.getLocations(); // sort the locations to stabilize the number of maps: PIG-1757 Arrays.sort(locations); HashSet<String> locationSeen = new HashSet<String>(); for (String location : locations) { if (!locationSeen.contains(location)) { Node node = nodeMap.get(location); if (node == null) { node = new Node(); nodes.add(node); nodeMap.put(location, node); } node.add(csplit); csplit.add(node); locationSeen.add(location); } } lastSplit = split; size++; } } /* verification code: debug purpose { ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>(); HashSet<InputSplit> seen = new HashSet<InputSplit>(); for (Node node : nodes) { if (node.getLength() > 0) { ArrayList<ComparableSplit> splits = node.getSplits(); for (ComparableSplit split : splits) { if (!seen.contains(split.getSplit())) { // remove duplicates. 
The set has to be on the raw input split not the // comparable input split as the latter overrides the compareTo method // so its equality semantics is changed and not we want here seen.add(split.getSplit()); leftoverSplits.add(split); } } } } int combinedSplitLen = 0; for (PigSplit split : result) combinedSplitLen += split.getNumPaths(); if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt) { throw new AssertionError("number of combined splits {"+combinedSplitLen+"+"+leftoverSplits.size()+"-"+size+"} does not match the number of original splits ["+nSplits+"]."); } } */ if (nSplits > 0 && emptyCnt == nSplits) { // if all splits are empty, add a single empty split as currently an empty directory is // not properly handled somewhere ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>(); combinedSplits.add(oneInputSplits.get(0)); result.add(combinedSplits); } else if (size == 1) { ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>(); combinedSplits.add(lastSplit); result.add(combinedSplits); } else if (size > 1) { // combine small splits Collections.sort(nodes, nodeComparator); DummySplit dummy = new DummySplit(); // dummy is used to search for next split of suitable size to be combined ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1); for (Node node : nodes) { // sort the splits on this node in descending order node.sort(); long totalSize = 0; ArrayList<ComparableSplit> splits = node.getSplits(); int idx; int lenSplits; ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>(); ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>(); while (!splits.isEmpty()) { combinedSplits.add(splits.get(0).getSplit()); combinedComparableSplits.add(splits.get(0)); int startIdx = 1; lenSplits = splits.size(); totalSize += splits.get(0).getSplit().getLength(); long spaceLeft = maxCombinedSplitSize - totalSize; dummy.setLength(spaceLeft); idx = 
Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits), dummyComparableSplit); idx = -idx - 1 + startIdx; while (idx < lenSplits) { long thisLen = splits.get(idx).getSplit().getLength(); combinedSplits.add(splits.get(idx).getSplit()); combinedComparableSplits.add(splits.get(idx)); totalSize += thisLen; spaceLeft -= thisLen; if (spaceLeft <= 0) break; // find next combinable chunk startIdx = idx + 1; if (startIdx >= lenSplits) break; dummy.setLength(spaceLeft); idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits), dummyComparableSplit); idx = -idx - 1 + startIdx; } if (totalSize > maxCombinedSplitSize / 2) { result.add(combinedSplits); resultLengths.add(totalSize); removeSplits(combinedComparableSplits); totalSize = 0; combinedSplits = new ArrayList<InputSplit>(); combinedComparableSplits.clear(); splits = node.getSplits(); } else { if (combinedSplits.size() != lenSplits) throw new AssertionError("Combined split logic error!"); break; } } } // handle leftovers ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>(); HashSet<InputSplit> seen = new HashSet<InputSplit>(); for (Node node : nodes) { for (ComparableSplit split : node.getSplits()) { if (!seen.contains(split.getSplit())) { // remove duplicates. 
The set has to be on the raw input split not the // comparable input split as the latter overrides the compareTo method // so its equality semantics is changed and not we want here seen.add(split.getSplit()); leftoverSplits.add(split); } } } /* verification code int combinedSplitLen = 0; for (PigSplit split : result) combinedSplitLen += split.getNumPaths(); if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt) throw new AssertionError("number of combined splits ["+combinedSplitLen+"+"+leftoverSplits.size()+"] does not match the number of original splits ["+nSplits+"]."); */ if (!leftoverSplits.isEmpty()) { long totalSize = 0; ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>(); ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>(); int splitLen = leftoverSplits.size(); for (int i = 0; i < splitLen; i++) { ComparableSplit split = leftoverSplits.get(i); long thisLen = split.getSplit().getLength(); if (totalSize + thisLen >= maxCombinedSplitSize) { removeSplits(combinedComparableSplits); result.add(combinedSplits); resultLengths.add(totalSize); combinedSplits = new ArrayList<InputSplit>(); combinedComparableSplits.clear(); totalSize = 0; } combinedSplits.add(split.getSplit()); combinedComparableSplits.add(split); totalSize += split.getSplit().getLength(); if (i == splitLen - 1) { // last piece: it could be very small, try to see it can be squeezed into any existing splits for (int j = 0; j < result.size(); j++) { if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) { List<InputSplit> isList = result.get(j); for (InputSplit csplit : combinedSplits) { isList.add(csplit); } removeSplits(combinedComparableSplits); combinedSplits.clear(); break; } } if (!combinedSplits.isEmpty()) { // last piece can not be squeezed in, create a new combined split for them. 
removeSplits(combinedComparableSplits); result.add(combinedSplits); } } } } } /* verification codes int combinedSplitLen = 0; for (PigSplit split : result) combinedSplitLen += split.getNumPaths(); if (combinedSplitLen != nSplits-emptyCnt) throw new AssertionError("number of combined splits ["+combinedSplitLen+"] does not match the number of original splits ["+nSplits+"]."); long totalLen = 0; for (PigSplit split : result) totalLen += split.getLength(); long origTotalLen = 0; for (InputSplit split : oneInputSplits) origTotalLen += split.getLength(); if (totalLen != origTotalLen) throw new AssertionError("The total length ["+totalLen+"] does not match the original ["+origTotalLen+"]"); */ log.info("Total input paths (combined) to process : " + result.size()); return result; }
From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java
License:Apache License
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException { // debugging purpose only StringBuilder st = new StringBuilder(); st.append("Number of splits :" + splits.length + "\n"); long len = 0; for (InputSplit split : splits) len += split.getLength(); st.append("Total Length = " + len + "\n"); for (int i = 0; i < splits.length; i++) { st.append("Input split[" + i + "]:\n Length = " + splits[i].getLength() + "\n Locations:\n"); for (String location : splits[i].getLocations()) st.append(" " + location + "\n"); st.append("\n-----------------------\n"); }/* w ww. j a va2 s.c o m*/ return st.toString(); }
From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatExportFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> hCatSplits = super.getSplits(job); int hCatSplitCount = hCatSplits.size(); int expectedSplitCount = ExportInputFormat.getNumMapTasks(job); if (expectedSplitCount == 0) { expectedSplitCount = hCatSplitCount; }//from w w w .j a va 2 s .c om LOG.debug("Expected split count " + expectedSplitCount); LOG.debug("HCatInputFormat provided split count " + hCatSplitCount); // Sort the splits by length descending. Collections.sort(hCatSplits, new Comparator<InputSplit>() { @Override public int compare(InputSplit is1, InputSplit is2) { try { return (int) (is2.getLength() - is1.getLength()); } catch (Exception e) { LOG.warn("Exception caught while sorting Input splits " + e); } return 0; } }); List<InputSplit> combinedSplits = new ArrayList<InputSplit>(); // The number of splits generated by HCatInputFormat is within // our limits if (hCatSplitCount <= expectedSplitCount) { for (InputSplit split : hCatSplits) { List<InputSplit> hcSplitList = new ArrayList<InputSplit>(); hcSplitList.add(split); combinedSplits.add(new SqoopHCatInputSplit(hcSplitList)); } return combinedSplits; } List<List<InputSplit>> combinedSplitList = new ArrayList<List<InputSplit>>(); for (int i = 0; i < expectedSplitCount; i++) { combinedSplitList.add(new ArrayList<InputSplit>()); } boolean ascendingAssigment = true; int lastSet = 0; for (int i = 0; i < hCatSplitCount; ++i) { int splitNum = i % expectedSplitCount; int currentSet = i / expectedSplitCount; if (currentSet != lastSet) { ascendingAssigment = !ascendingAssigment; } if (ascendingAssigment) { combinedSplitList.get(splitNum).add(hCatSplits.get(i)); } else { combinedSplitList.get(expectedSplitCount - 1 - splitNum).add(hCatSplits.get(i)); } lastSet = currentSet; } for (int i = 0; i < expectedSplitCount; i++) { SqoopHCatInputSplit sqoopSplit = new SqoopHCatInputSplit(combinedSplitList.get(i)); combinedSplits.add(sqoopSplit); } return 
combinedSplits; }
From source file:org.apache.tez.mapreduce.common.TestMRInputAMSplitGenerator.java
License:Apache License
private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled, boolean sortSplitsEnabled) throws Exception { Configuration conf = new Configuration(); String[] splitLengths = new String[50]; for (int i = 0; i < splitLengths.length; i++) { splitLengths[i] = Integer.toString(1000 * (i + 1)); }/*from ww w .j a v a 2 s.c om*/ conf.setStrings(SPLITS_LENGTHS, splitLengths); DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, InputFormatForTest.class) .groupSplits(groupSplitsEnabled).sortSplits(sortSplitsEnabled).build(); UserPayload userPayload = dataSource.getInputDescriptor().getUserPayload(); InputInitializerContext context = new TezTestUtils.TezRootInputInitializerContextForTest(userPayload); MRInputAMSplitGenerator splitGenerator = new MRInputAMSplitGenerator(context); List<Event> events = splitGenerator.initialize(); assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent); boolean shuffled = false; InputSplit previousIs = null; int numRawInputSplits = 0; for (int i = 1; i < events.size(); i++) { assertTrue(events.get(i) instanceof InputDataInformationEvent); InputDataInformationEvent diEvent = (InputDataInformationEvent) (events.get(i)); assertNull(diEvent.getDeserializedUserPayload()); assertNotNull(diEvent.getUserPayload()); MRSplitProto eventProto = MRSplitProto.parseFrom(ByteString.copyFrom(diEvent.getUserPayload())); InputSplit is = MRInputUtils.getNewSplitDetailsFromEvent(eventProto, new Configuration()); if (groupSplitsEnabled) { numRawInputSplits += ((TezGroupedSplit) is).getGroupedSplits().size(); for (InputSplit inputSplit : ((TezGroupedSplit) is).getGroupedSplits()) { assertTrue(inputSplit instanceof InputSplitForTest); } assertTrue(((TezGroupedSplit) is).getGroupedSplits().get(0) instanceof InputSplitForTest); } else { numRawInputSplits++; assertTrue(is instanceof InputSplitForTest); } // The splits in the list returned from InputFormat has ascending // size in order. 
// If sortSplitsEnabled is true, MRInputAMSplitGenerator will sort the // splits in descending order. // If sortSplitsEnabled is false, MRInputAMSplitGenerator will shuffle // the splits. if (previousIs != null) { if (sortSplitsEnabled) { assertTrue(is.getLength() <= previousIs.getLength()); } else { shuffled |= (is.getLength() > previousIs.getLength()); } } previousIs = is; } assertEquals(splitLengths.length, numRawInputSplits); if (!sortSplitsEnabled) { assertTrue(shuffled); } }
From source file:parquet.pig.ParquetLoader.java
License:Apache License
@Override public ResourceStatistics getStatistics(String location, Job job) throws IOException { if (DEBUG)//from w w w. j a v a2s .co m LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")"); // We do not need to call setInput // as setLocation is guaranteed to be called before this long length = 0; try { for (InputSplit split : getParquetInputFormat().getSplits(job)) { length += split.getLength(); } } catch (InterruptedException e) { LOG.warn("Interrupted: ", e); return null; } ResourceStatistics stats = new ResourceStatistics(); // TODO use pig-0.12 setBytes api when its available stats.setmBytes(length / 1024 / 1024); return stats; }