Example usage for org.apache.hadoop.mapreduce InputSplit getLength

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.InputSplit.getLength().

Prototype

public abstract long getLength() throws IOException, InterruptedException;

Source Link

Document

Get the size of the split, so that the input splits can be sorted by size.

Usage

From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java

License:Apache License

/**
 * Computes the total size of a job's map input in megabytes.
 *
 * <p>The job's input format is instantiated reflectively, asked for its splits,
 * and the lengths of those splits are summed.
 *
 * @param job the job whose input splits are measured
 * @return the summed split length in bytes divided by 1024 twice
 * @throws JobException if {@code job} is null
 * @throws IllegalArgumentException if the summed split length is 0
 */
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
            job.getConfiguration());

    long totalBytes = 0;
    for (InputSplit inputSplit : inputFormat.getSplits(job)) {
        totalBytes = totalBytes + inputSplit.getLength();
    }

    if (totalBytes != 0) {
        return (double) totalBytes / 1024 / 1024;
    }
    throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
}

From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.Builder.java

License:Apache License

/**
 * Sorts the given splits in place by size, largest first.<br>
 * This is the same ordering used by Hadoop's JobClient.
 *
 * @param splits
 *          input splits; reordered in place
 * @throws IllegalStateException
 *           if the length of a split cannot be obtained while comparing
 */
public static void sortSplits(InputSplit[] splits) {
    Comparator<InputSplit> largestFirst = new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            final long sizeA;
            final long sizeB;
            try {
                sizeA = a.getLength();
                sizeB = b.getLength();
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            } catch (InterruptedException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
            // Reverse of natural ordering: the larger split sorts first.
            if (sizeA > sizeB) {
                return -1;
            }
            return (sizeA < sizeB) ? 1 : 0;
        }
    };
    Arrays.sort(splits, largestFirst);
}

From source file:org.apache.parquet.pig.ParquetLoader.java

License:Apache License

/**
 * Reports the size of the input at {@code location} as resource statistics.
 *
 * <p>The input is (re)configured via {@code setInput}, the Parquet input format is asked
 * for its splits, and the split lengths are summed and converted to whole megabytes.
 *
 * @param location the input location passed on to {@code setInput}
 * @param job the job used to configure the input and compute the splits
 * @return statistics with the megabyte size set, or {@code null} if the thread was
 *         interrupted while the splits were being computed
 * @throws IOException if computing the splits or their lengths fails
 */
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
    if (DEBUG) {
        LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
    }
    /* We need to call setInput since setLocation is not
       guaranteed to be called before this */
    setInput(location, job);
    long length = 0;
    try {
        for (InputSplit split : getParquetInputFormat().getSplits(job)) {
            length += split.getLength();
        }
    } catch (InterruptedException e) {
        // Catching InterruptedException clears the interrupt flag; restore it so that
        // callers further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        LOG.warn("Interrupted: ", e);
        return null;
    }
    ResourceStatistics stats = new ResourceStatistics();
    // TODO use pig-0.12 setBytes api when its available
    stats.setmBytes(length / 1024 / 1024);
    return stats;
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit.java

License:Apache License

/**
 * Returns the hosts that hold the most data for the wrapped splits.
 *
 * <p>The result is computed on the first call and cached in {@code locations}. For every
 * wrapped split, its length is credited to each of its locations; the locations are then
 * ranked by accumulated length, largest first, and at most five are kept.
 *
 * @return the cached array of up to five location names
 */
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
    if (locations != null) {
        return locations;
    }

    // Accumulate, per location, the total length of the splits stored there.
    HashMap<String, Long> bytesPerHost = new HashMap<String, Long>();
    for (InputSplit wrapped : wrappedSplits) {
        for (String host : wrapped.getLocations()) {
            Long soFar = bytesPerHost.get(host);
            bytesPerHost.put(host, soFar == null ? wrapped.getLength() : soFar + wrapped.getLength());
        }
    }

    Set<Map.Entry<String, Long>> entries = bytesPerHost.entrySet();
    Map.Entry<String, Long>[] ranked = entries.toArray(new Map.Entry[entries.size()]);
    Comparator<Map.Entry<String, Long>> mostBytesFirst = new Comparator<Map.Entry<String, Long>>() {

        @Override
        public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
            long diff = o1.getValue() - o2.getValue();
            if (diff == 0) {
                return 0;
            }
            return diff < 0 ? 1 : -1;
        }
    };
    Arrays.sort(ranked, mostBytesFirst);

    // maximum 5 locations are in list: refer to PIG-1648 for more details
    int keep = Math.min(ranked.length, 5);
    String[] topHosts = new String[keep];
    int pos = 0;
    while (pos < keep) {
        topHosts[pos] = ranked[pos].getKey();
        pos++;
    }
    locations = topHosts;
    return locations;
}

From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java

License:Apache License

/**
 * Groups the given input splits into lists of splits that are to be processed together.
 *
 * <p>Outline of what the body does:
 * <ul>
 * <li>Splits of length 0 are skipped (and counted).</li>
 * <li>A split whose length is at least {@code maxCombinedSplitSize} becomes a group of
 * its own.</li>
 * <li>Every other ("small") split is registered on one {@code Node} per distinct
 * location; small splits are then combined node by node, and whatever remains is
 * packed sequentially as leftovers.</li>
 * <li>If all splits are empty, a single group holding the first split is returned.</li>
 * </ul>
 *
 * @param oneInputSplits the splits to group
 * @param maxCombinedSplitSize the target upper bound for the summed length of a group
 * @param conf not referenced anywhere in this method body
 * @return the groups of splits; the number of groups is also logged
 */
public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    // resultLengths.get(j) is the summed length of result.get(j) for the groups added
    // together with a length below
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    // size counts the small (combinable) splits only
    int size = 0, nSplits = oneInputSplits.size();
    // most recent small split seen; emitted on its own when size ends up being 1
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            // already large enough: emit as a single-element group
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            // guards against registering the split twice when a location is repeated
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    /* verification code: debug purpose
    {
      ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
      HashSet<InputSplit> seen = new HashSet<InputSplit>();
      for (Node node : nodes) {
    if (node.getLength() > 0)
    {
      ArrayList<ComparableSplit> splits = node.getSplits();
      for (ComparableSplit split : splits) {
        if (!seen.contains(split.getSplit())) {
          // remove duplicates. The set has to be on the raw input split not the
          // comparable input split as the latter overrides the compareTo method
          // so its equality semantics is changed and not we want here
          seen.add(split.getSplit());
          leftoverSplits.add(split);
        }
      }
    }
      }
            
      int combinedSplitLen = 0;
      for (PigSplit split : result)
    combinedSplitLen += split.getNumPaths();
      if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt) {
    throw new AssertionError("number of combined splits {"+combinedSplitLen+"+"+leftoverSplits.size()+"-"+size+"} does not match the number of original splits ["+nSplits+"].");
      }
    }
    */
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        // exactly one small split: nothing to combine it with, emit it on its own
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                // start a candidate group with the first split on this node
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                // turns a negative "not found" result into an insertion point within the
                // full list. NOTE(review): presumes the dummy never compares equal to a
                // real split (non-negative result) - confirm against ComparableSplit.compareTo
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    // group is more than half full: emit it and start over on this node
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    // group stayed at or below half the target: it is expected to hold every
                    // split on this node; they are left in place for the leftover pass
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split not the
                    // comparable input split as the latter overrides the compareTo method
                    // so its equality semantics is changed and not we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        /* verification code
        int combinedSplitLen = 0;
        for (PigSplit split : result)
          combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt)
          throw new AssertionError("number of combined splits ["+combinedSplitLen+"+"+leftoverSplits.size()+"] does not match the number of original splits ["+nSplits+"].");
        */
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    // adding this split would reach the target: emit what was gathered so far
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small, try to see it can be squeezed into any existing splits
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // last piece can not be squeezed in, create a new combined split for them.
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    /* verification codes
    int combinedSplitLen = 0;
    for (PigSplit split : result)
      combinedSplitLen += split.getNumPaths();
    if (combinedSplitLen != nSplits-emptyCnt)
      throw new AssertionError("number of combined splits ["+combinedSplitLen+"] does not match the number of original splits ["+nSplits+"].");
            
    long totalLen = 0;
    for (PigSplit split : result)
      totalLen += split.getLength();
            
    long origTotalLen = 0;
    for (InputSplit split : oneInputSplits)
      origTotalLen += split.getLength();
    if (totalLen != origTotalLen)
      throw new AssertionError("The total length ["+totalLen+"] does not match the original ["+origTotalLen+"]");
    */
    log.info("Total input paths (combined) to process : " + result.size());
    return result;
}

From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java

License:Apache License

/**
 * Renders a human-readable summary of the given splits; intended for debugging only.
 *
 * <p>The summary lists the number of splits, their total length and, per split, its
 * length and locations.
 *
 * @param splits the splits to describe
 * @return the multi-line description
 */
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    StringBuilder out = new StringBuilder();
    out.append("Number of splits :").append(splits.length).append("\n");

    long totalLength = 0;
    for (InputSplit each : splits) {
        totalLength += each.getLength();
    }
    out.append("Total Length = ").append(totalLength).append("\n");

    int index = 0;
    for (InputSplit each : splits) {
        out.append("Input split[").append(index).append("]:\n   Length = ").append(each.getLength())
                .append("\n  Locations:\n");
        for (String host : each.getLocations()) {
            out.append("    ").append(host).append("\n");
        }
        out.append("\n-----------------------\n");
        index++;
    }
    return out.toString();
}

From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatExportFormat.java

License:Apache License

/**
 * Returns the HCatalog splits wrapped in {@code SqoopHCatInputSplit}s, combined so that
 * no more splits are produced than the expected number of map tasks.
 *
 * <p>The splits obtained from the superclass are sorted by length, largest first. If
 * there are no more of them than expected, each is wrapped on its own. Otherwise they
 * are dealt out round by round over {@code expectedSplitCount} buckets, reversing the
 * direction on every round (0..n-1, then n-1..0, and so on), and each bucket becomes
 * one combined split.
 *
 * @param job the job context used to obtain the splits and the expected map task count
 * @return the wrapped (and possibly combined) splits
 * @throws IOException if the superclass fails to compute the splits
 * @throws InterruptedException if the superclass is interrupted while computing the splits
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> hCatSplits = super.getSplits(job);
    int hCatSplitCount = hCatSplits.size();
    int expectedSplitCount = ExportInputFormat.getNumMapTasks(job);
    if (expectedSplitCount == 0) {
        // no explicit map task count: keep one combined split per HCatalog split
        expectedSplitCount = hCatSplitCount;
    }
    LOG.debug("Expected split count " + expectedSplitCount);
    LOG.debug("HCatInputFormat provided split count " + hCatSplitCount);

    // Sort the splits by length descending.
    Collections.sort(hCatSplits, new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit is1, InputSplit is2) {
            try {
                long length2 = is2.getLength();
                long length1 = is1.getLength();
                // Compare the long lengths directly. Casting their difference to int
                // truncates it, so splits that differ by 2 GiB or more could be reported
                // with the wrong sign (or as equal), violating the Comparator contract.
                if (length2 > length1) {
                    return 1;
                }
                if (length2 < length1) {
                    return -1;
                }
                return 0;
            } catch (Exception e) {
                // best effort: treat splits whose length cannot be read as equal
                LOG.warn("Exception caught while sorting Input splits " + e);
            }
            return 0;
        }
    });
    List<InputSplit> combinedSplits = new ArrayList<InputSplit>();

    // The number of splits generated by HCatInputFormat is within
    // our limits
    if (hCatSplitCount <= expectedSplitCount) {
        for (InputSplit split : hCatSplits) {
            List<InputSplit> hcSplitList = new ArrayList<InputSplit>();
            hcSplitList.add(split);
            combinedSplits.add(new SqoopHCatInputSplit(hcSplitList));
        }
        return combinedSplits;
    }

    List<List<InputSplit>> combinedSplitList = new ArrayList<List<InputSplit>>();
    for (int i = 0; i < expectedSplitCount; i++) {
        combinedSplitList.add(new ArrayList<InputSplit>());
    }

    // Deal the (descending) splits out in alternating direction so that the buckets
    // receiving the largest splits in one round receive the smallest in the next.
    for (int i = 0; i < hCatSplitCount; ++i) {
        int splitNum = i % expectedSplitCount;
        int round = i / expectedSplitCount;
        boolean ascendingAssignment = (round % 2 == 0);
        int target = ascendingAssignment ? splitNum : expectedSplitCount - 1 - splitNum;
        combinedSplitList.get(target).add(hCatSplits.get(i));
    }

    for (int i = 0; i < expectedSplitCount; i++) {
        SqoopHCatInputSplit sqoopSplit = new SqoopHCatInputSplit(combinedSplitList.get(i));
        combinedSplits.add(sqoopSplit);
    }

    return combinedSplits;
}

From source file:org.apache.tez.mapreduce.common.TestMRInputAMSplitGenerator.java

License:Apache License

/**
 * Runs the split generator over 50 test splits of lengths 1000, 2000, ..., 50000 and
 * checks the emitted events for the given grouping/sorting configuration.
 *
 * @param groupSplitsEnabled whether split grouping is enabled on the MRInput config
 * @param sortSplitsEnabled whether split sorting is enabled on the MRInput config
 */
private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled, boolean sortSplitsEnabled)
        throws Exception {
    String[] splitLengths = new String[50];
    for (int n = 1; n <= splitLengths.length; n++) {
        splitLengths[n - 1] = String.valueOf(1000 * n);
    }
    Configuration conf = new Configuration();
    conf.setStrings(SPLITS_LENGTHS, splitLengths);

    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, InputFormatForTest.class)
            .groupSplits(groupSplitsEnabled).sortSplits(sortSplitsEnabled).build();
    UserPayload userPayload = dataSource.getInputDescriptor().getUserPayload();
    InputInitializerContext context = new TezTestUtils.TezRootInputInitializerContextForTest(userPayload);
    List<Event> events = new MRInputAMSplitGenerator(context).initialize();

    // The first event configures the vertex; every following event carries one split.
    assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent);

    int numRawInputSplits = 0;
    boolean shuffled = false;
    InputSplit previousIs = null;
    for (int i = 1; i < events.size(); i++) {
        Event event = events.get(i);
        assertTrue(event instanceof InputDataInformationEvent);
        InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
        assertNull(diEvent.getDeserializedUserPayload());
        assertNotNull(diEvent.getUserPayload());

        MRSplitProto eventProto = MRSplitProto.parseFrom(ByteString.copyFrom(diEvent.getUserPayload()));
        InputSplit is = MRInputUtils.getNewSplitDetailsFromEvent(eventProto, new Configuration());

        if (!groupSplitsEnabled) {
            numRawInputSplits++;
            assertTrue(is instanceof InputSplitForTest);
        } else {
            TezGroupedSplit grouped = (TezGroupedSplit) is;
            numRawInputSplits += grouped.getGroupedSplits().size();
            for (InputSplit member : grouped.getGroupedSplits()) {
                assertTrue(member instanceof InputSplitForTest);
            }
            assertTrue(grouped.getGroupedSplits().get(0) instanceof InputSplitForTest);
        }

        // The splits in the list returned from InputFormat has ascending
        // size in order.
        // If sortSplitsEnabled is true, MRInputAMSplitGenerator will sort the
        // splits in descending order.
        // If sortSplitsEnabled is false, MRInputAMSplitGenerator will shuffle
        // the splits.
        if (previousIs != null) {
            if (!sortSplitsEnabled) {
                if (is.getLength() > previousIs.getLength()) {
                    shuffled = true;
                }
            } else {
                assertTrue(is.getLength() <= previousIs.getLength());
            }
        }
        previousIs = is;
    }

    assertEquals(splitLengths.length, numRawInputSplits);
    // Without sorting, at least one split must be larger than its predecessor.
    assertTrue(sortSplitsEnabled || shuffled);
}

From source file:parquet.pig.ParquetLoader.java

License:Apache License

/**
 * Reports the size of the job's Parquet input as resource statistics.
 *
 * <p>The Parquet input format is asked for its splits, and the split lengths are summed
 * and converted to whole megabytes.
 *
 * @param location the input location; only used in the debug log message here
 * @param job the job used to compute the splits
 * @return statistics with the megabyte size set, or {@code null} if the thread was
 *         interrupted while the splits were being computed
 * @throws IOException if computing the splits or their lengths fails
 */
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
    if (DEBUG) {
        LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
    }
    // We do not need to call setInput
    // as setLocation is guaranteed to be called before this
    long length = 0;
    try {
        for (InputSplit split : getParquetInputFormat().getSplits(job)) {
            length += split.getLength();
        }
    } catch (InterruptedException e) {
        // Catching InterruptedException clears the interrupt flag; restore it so that
        // callers further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        LOG.warn("Interrupted: ", e);
        return null;
    }
    ResourceStatistics stats = new ResourceStatistics();
    // TODO use pig-0.12 setBytes api when its available
    stats.setmBytes(length / 1024 / 1024);
    return stats;
}