Example usage for org.apache.hadoop.mapreduce InputSplit getLength

List of usage examples for org.apache.hadoop.mapreduce InputSplit getLength

Introduction

This page collects example usages of the getLength() method of org.apache.hadoop.mapreduce.InputSplit.

Prototype

public abstract long getLength() throws IOException, InterruptedException;

Source Link

Document

Get the size of the split, so that the input splits can be sorted by size.

Usage

From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java

License:Apache License

/**
 * Adds up the lengths of all input splits of the given job and reports the total in megabytes.
 *
 * @param job the job whose input format class and configuration are used to compute the splits
 * @return the summed split length in bytes divided by 1024 twice, as a double
 * @throws JobException if {@code job} is null
 * @throws IllegalArgumentException if the split lengths add up to zero bytes
 */
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
            job.getConfiguration());

    long totalBytes = 0;
    for (InputSplit inputSplit : inputFormat.getSplits(job)) {
        totalBytes += inputSplit.getLength();
    }

    // A zero total is treated as a sign that the input is misconfigured.
    if (totalBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    return (double) totalBytes / 1024 / 1024;
}

From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.Builder.java

License:Apache License

/**
 * Sorts the given splits in place by descending length, so that the biggest go first.<br>
 * The original comment notes that this ordering matches the one used by Hadoop's JobClient.
 *
 * @param splits
 *          input splits
 * @throws IllegalStateException
 *           if the length of a split cannot be obtained
 */
public static void sortSplits(InputSplit[] splits) {
    Comparator<InputSplit> largestFirst = new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            long sizeA;
            long sizeB;
            try {
                sizeA = a.getLength();
                sizeB = b.getLength();
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            } catch (InterruptedException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
            // Inverted comparison: the larger split sorts first.
            if (sizeA > sizeB) {
                return -1;
            }
            if (sizeA < sizeB) {
                return 1;
            }
            return 0;
        }
    };
    Arrays.sort(splits, largestFirst);
}

From source file:org.apache.parquet.pig.ParquetLoader.java

License:Apache License

/**
 * Builds size statistics for the input at {@code location} by summing the lengths of all splits
 * returned by the Parquet input format for the given job.
 *
 * @param location the input location, forwarded to {@code setInput}
 * @param job the job used to compute the splits
 * @return statistics whose megabyte size is the byte total divided by 1024 twice (integer
 *         division), or {@code null} if the thread was interrupted while reading the splits
 * @throws IOException if computing the splits or reading a split length fails
 */
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
    if (DEBUG) {
        LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
    }
    /* We need to call setInput since setLocation is not
       guaranteed to be called before this */
    setInput(location, job);
    long length = 0;
    try {
        for (InputSplit split : getParquetInputFormat().getSplits(job)) {
            length += split.getLength();
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag: catching the exception clears it, and code further up
        // the stack must still be able to see that the thread was asked to stop.
        Thread.currentThread().interrupt();
        LOG.warn("Interrupted: ", e);
        return null;
    }
    ResourceStatistics stats = new ResourceStatistics();
    // TODO use pig-0.12 setBytes api when its available
    stats.setmBytes(length / 1024 / 1024);
    return stats;
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit.java

License:Apache License

/**
 * Returns the locations for this split, computing and caching them on first use.
 * <p>
 * Each location reported by a wrapped split is credited with that split's length; locations are
 * then ranked by their accumulated length, largest first, and at most five are kept.
 *
 * @return the cached array of location names
 */
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
    if (locations != null) {
        return locations;
    }

    // Accumulate, per location, the total length of the wrapped splits that list it.
    HashMap<String, Long> bytesPerHost = new HashMap<String, Long>();
    for (InputSplit wrapped : wrappedSplits) {
        for (String host : wrapped.getLocations()) {
            Long soFar = bytesPerHost.get(host);
            long splitLen = wrapped.getLength();
            bytesPerHost.put(host, soFar == null ? splitLen : soFar + splitLen);
        }
    }

    Set<Map.Entry<String, Long>> entries = bytesPerHost.entrySet();
    Map.Entry<String, Long>[] ranked = entries.toArray(new Map.Entry[entries.size()]);
    Arrays.sort(ranked, new Comparator<Map.Entry<String, Long>>() {

        @Override
        public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
            // Descending by accumulated length.
            long diff = o1.getValue() - o2.getValue();
            return diff < 0 ? 1 : (diff > 0 ? -1 : 0);
        }
    });

    // maximum 5 locations are in list: refer to PIG-1648 for more details
    int nHost = Math.min(ranked.length, 5);
    locations = new String[nHost];
    int slot = 0;
    while (slot < nHost) {
        locations[slot] = ranked[slot].getKey();
        slot++;
    }
    return locations;
}

From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java

License:Apache License

/**
 * Groups input splits into lists that are meant to be processed together.
 * <p>
 * As visible in this method: zero-length splits are skipped; a split whose length is at least
 * {@code maxCombinedSplitSize} becomes a group of its own; the remaining splits are indexed by
 * location and packed per location into groups, and whatever is still unassigned afterwards is
 * packed in a final leftover pass. If every split is empty, the result holds a single group
 * containing only the first split.
 *
 * @param oneInputSplits the splits to group
 * @param maxCombinedSplitSize size threshold (compared against {@code getLength()}) used to decide
 *        whether a split stands alone and when a group is closed
 * @param conf not referenced anywhere in this method body
 * @return the groups, each a list of the original split objects
 * @throws IOException propagated from the split accessors
 * @throws InterruptedException propagated from the split accessors
 */
public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException {
    // nodes: one Node per distinct location string seen on the small splits (see nodeMap lookups below)
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    // lengths recorded for entries of result in the stand-alone and combine paths; only read in
    // the "squeeze the last piece" step near the end
    List<Long> resultLengths = new ArrayList<Long>();
    // id passed to each ComparableSplit; also advanced for splits that stand alone
    long comparableSplitId = 0;

    // size counts the non-empty splits that are smaller than maxCombinedSplitSize
    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            // large enough already: emit as a group of one
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            // register the split with the node of each distinct location it reports
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    /* verification code: debug purpose
    {
      ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
      HashSet<InputSplit> seen = new HashSet<InputSplit>();
      for (Node node : nodes) {
    if (node.getLength() > 0)
    {
      ArrayList<ComparableSplit> splits = node.getSplits();
      for (ComparableSplit split : splits) {
        if (!seen.contains(split.getSplit())) {
          // remove duplicates. The set has to be on the raw input split not the
          // comparable input split as the latter overrides the compareTo method
          // so its equality semantics is changed and not we want here
          seen.add(split.getSplit());
          leftoverSplits.add(split);
        }
      }
    }
      }
            
      int combinedSplitLen = 0;
      for (PigSplit split : result)
    combinedSplitLen += split.getNumPaths();
      if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt) {
    throw new AssertionError("number of combined splits {"+combinedSplitLen+"+"+leftoverSplits.size()+"-"+size+"} does not match the number of original splits ["+nSplits+"].");
      }
    }
    */
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        // exactly one small split: nothing to combine it with, emit it alone
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                // start (or continue) a group with the first split in the node's list
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                // NOTE(review): the result is always converted as (-idx - 1), i.e. treated as an
                // insertion point; this relies on binarySearch never reporting an exact match
                // for the dummy - confirm against ComparableSplit.compareTo
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    // group is more than half the threshold: emit it and start a fresh one
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    // removeSplits is defined elsewhere; presumably it detaches these splits
                    // from their nodes so they are not picked again - verify
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    // too small to emit: the group is expected to hold every split of this node;
                    // they stay on the node and are picked up by the leftover pass below
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split not the
                    // comparable input split as the latter overrides the compareTo method
                    // so its equality semantics is changed and not we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        /* verification code
        int combinedSplitLen = 0;
        for (PigSplit split : result)
          combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt)
          throw new AssertionError("number of combined splits ["+combinedSplitLen+"+"+leftoverSplits.size()+"] does not match the number of original splits ["+nSplits+"].");
        */
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                // close the current group before adding a split that would reach the threshold
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small, try to see it can be squeezed into any existing splits
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // last piece can not be squeezed in, create a new combined split for them.
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    /* verification codes
    int combinedSplitLen = 0;
    for (PigSplit split : result)
      combinedSplitLen += split.getNumPaths();
    if (combinedSplitLen != nSplits-emptyCnt)
      throw new AssertionError("number of combined splits ["+combinedSplitLen+"] does not match the number of original splits ["+nSplits+"].");
            
    long totalLen = 0;
    for (PigSplit split : result)
      totalLen += split.getLength();
            
    long origTotalLen = 0;
    for (InputSplit split : oneInputSplits)
      origTotalLen += split.getLength();
    if (totalLen != origTotalLen)
      throw new AssertionError("The total length ["+totalLen+"] does not match the original ["+origTotalLen+"]");
    */
    log.info("Total input paths (combined) to process : " + result.size());
    return result;
}

From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java

License:Apache License

/**
 * Renders a human-readable summary of the given splits: their count, their total length, and
 * for each split its length and locations. Intended for debugging only.
 *
 * @param splits the splits to describe
 * @return the multi-line description
 */
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    StringBuilder out = new StringBuilder();
    out.append("Number of splits :").append(splits.length).append("\n");

    long totalLength = 0;
    for (InputSplit each : splits) {
        totalLength += each.getLength();
    }
    out.append("Total Length = ").append(totalLength).append("\n");

    int index = 0;
    for (InputSplit each : splits) {
        out.append("Input split[").append(index).append("]:\n   Length = ")
                .append(each.getLength()).append("\n  Locations:\n");
        for (String location : each.getLocations()) {
            out.append("    ").append(location).append("\n");
        }
        out.append("\n-----------------------\n");
        index++;
    }
    return out.toString();
}

From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatExportFormat.java

License:Apache License

/**
 * Returns the splits of the parent input format, regrouped so that there are at most as many
 * combined splits as the expected number of map tasks.
 * <p>
 * The parent's splits are sorted by descending length. If there are no more of them than
 * expected, each is wrapped in its own {@code SqoopHCatInputSplit}. Otherwise they are dealt
 * out to {@code expectedSplitCount} buckets in a back-and-forth order (first pass ascending
 * bucket index, next pass descending, and so on) and each bucket is wrapped.
 *
 * @param job the job context, used to obtain the parent's splits and the number of map tasks
 * @return the list of combined splits
 * @throws IOException propagated from the parent's split computation
 * @throws InterruptedException propagated from the parent's split computation
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> hCatSplits = super.getSplits(job);
    int hCatSplitCount = hCatSplits.size();
    int expectedSplitCount = ExportInputFormat.getNumMapTasks(job);
    if (expectedSplitCount == 0) {
        // no explicit task count: keep one combined split per original split
        expectedSplitCount = hCatSplitCount;
    }
    LOG.debug("Expected split count " + expectedSplitCount);
    LOG.debug("HCatInputFormat provided split count " + hCatSplitCount);

    // Sort the splits by length descending.
    Collections.sort(hCatSplits, new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit is1, InputSplit is2) {
            try {
                long len1 = is1.getLength();
                long len2 = is2.getLength();
                // Compare explicitly instead of narrowing (len2 - len1) to int: the cast
                // discards the high bits, so a difference outside the int range could come
                // out with the wrong sign (or as 0) and corrupt the ordering.
                if (len2 > len1) {
                    return 1;
                }
                if (len2 < len1) {
                    return -1;
                }
                return 0;
            } catch (Exception e) {
                // best effort: a split whose length cannot be read is treated as equal
                LOG.warn("Exception caught while sorting Input splits " + e);
            }
            return 0;
        }
    });
    List<InputSplit> combinedSplits = new ArrayList<InputSplit>();

    // The number of splits generated by HCatInputFormat is within
    // our limits
    if (hCatSplitCount <= expectedSplitCount) {
        for (InputSplit split : hCatSplits) {
            List<InputSplit> hcSplitList = new ArrayList<InputSplit>();
            hcSplitList.add(split);
            combinedSplits.add(new SqoopHCatInputSplit(hcSplitList));
        }
        return combinedSplits;
    }

    List<List<InputSplit>> combinedSplitList = new ArrayList<List<InputSplit>>();
    for (int i = 0; i < expectedSplitCount; i++) {
        combinedSplitList.add(new ArrayList<InputSplit>());
    }

    // Deal the sorted splits out in passes of expectedSplitCount, reversing the bucket order
    // on every other pass so large and small splits are mixed across buckets.
    for (int i = 0; i < hCatSplitCount; ++i) {
        int splitNum = i % expectedSplitCount;
        boolean ascendingPass = (i / expectedSplitCount) % 2 == 0;
        int bucket = ascendingPass ? splitNum : expectedSplitCount - 1 - splitNum;
        combinedSplitList.get(bucket).add(hCatSplits.get(i));
    }

    for (int i = 0; i < expectedSplitCount; i++) {
        combinedSplits.add(new SqoopHCatInputSplit(combinedSplitList.get(i)));
    }

    return combinedSplits;
}

From source file:org.apache.tez.mapreduce.common.TestMRInputAMSplitGenerator.java

License:Apache License

/**
 * Runs {@code MRInputAMSplitGenerator} over 50 configured split lengths (1000, 2000, ...) and
 * checks the generated events: the first must be a vertex-configuration event, every other one
 * a data-information event carrying a split of the expected type, the number of raw splits must
 * equal the number of configured lengths, and the order of split lengths must match the
 * sorting mode.
 *
 * @param groupSplitsEnabled whether split grouping is enabled on the input
 * @param sortSplitsEnabled whether split sorting is enabled on the input
 */
private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled, boolean sortSplitsEnabled)
        throws Exception {
    String[] splitLengths = new String[50];
    for (int n = 1; n <= splitLengths.length; n++) {
        splitLengths[n - 1] = Integer.toString(1000 * n);
    }
    Configuration conf = new Configuration();
    conf.setStrings(SPLITS_LENGTHS, splitLengths);

    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, InputFormatForTest.class)
            .groupSplits(groupSplitsEnabled).sortSplits(sortSplitsEnabled).build();
    UserPayload userPayload = dataSource.getInputDescriptor().getUserPayload();
    InputInitializerContext context = new TezTestUtils.TezRootInputInitializerContextForTest(userPayload);
    MRInputAMSplitGenerator splitGenerator = new MRInputAMSplitGenerator(context);
    List<Event> events = splitGenerator.initialize();

    assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent);

    boolean shuffled = false;
    InputSplit previousIs = null;
    int numRawInputSplits = 0;
    for (int i = 1; i < events.size(); i++) {
        Event event = events.get(i);
        assertTrue(event instanceof InputDataInformationEvent);
        InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
        assertNull(diEvent.getDeserializedUserPayload());
        assertNotNull(diEvent.getUserPayload());

        MRSplitProto eventProto = MRSplitProto.parseFrom(ByteString.copyFrom(diEvent.getUserPayload()));
        InputSplit is = MRInputUtils.getNewSplitDetailsFromEvent(eventProto, new Configuration());
        if (!groupSplitsEnabled) {
            numRawInputSplits++;
            assertTrue(is instanceof InputSplitForTest);
        } else {
            TezGroupedSplit grouped = (TezGroupedSplit) is;
            numRawInputSplits += grouped.getGroupedSplits().size();
            for (InputSplit member : grouped.getGroupedSplits()) {
                assertTrue(member instanceof InputSplitForTest);
            }
            assertTrue(grouped.getGroupedSplits().get(0) instanceof InputSplitForTest);
        }

        // The splits in the list returned from InputFormat has ascending
        // size in order.
        // If sortSplitsEnabled is true, MRInputAMSplitGenerator will sort the
        // splits in descending order.
        // If sortSplitsEnabled is false, MRInputAMSplitGenerator will shuffle
        // the splits.
        if (previousIs != null) {
            long currentLength = is.getLength();
            long previousLength = previousIs.getLength();
            if (sortSplitsEnabled) {
                assertTrue(currentLength <= previousLength);
            } else if (currentLength > previousLength) {
                shuffled = true;
            }
        }
        previousIs = is;
    }

    assertEquals(splitLengths.length, numRawInputSplits);
    if (!sortSplitsEnabled) {
        assertTrue(shuffled);
    }
}

From source file:parquet.pig.ParquetLoader.java

License:Apache License

/**
 * Builds size statistics by summing the lengths of all splits returned by the Parquet input
 * format for the given job.
 *
 * @param location the input location; only used in the debug log message here
 * @param job the job used to compute the splits
 * @return statistics whose megabyte size is the byte total divided by 1024 twice (integer
 *         division), or {@code null} if the thread was interrupted while reading the splits
 * @throws IOException if computing the splits or reading a split length fails
 */
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
    if (DEBUG) {
        LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
    }
    // We do not need to call setInput
    // as setLocation is guaranteed to be called before this
    long length = 0;
    try {
        for (InputSplit split : getParquetInputFormat().getSplits(job)) {
            length += split.getLength();
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag: catching the exception clears it, and code further up
        // the stack must still be able to see that the thread was asked to stop.
        Thread.currentThread().interrupt();
        LOG.warn("Interrupted: ", e);
        return null;
    }
    ResourceStatistics stats = new ResourceStatistics();
    // TODO use pig-0.12 setBytes api when its available
    stats.setmBytes(length / 1024 / 1024);
    return stats;
}