Usage examples for org.apache.hadoop.mapreduce.InputSplit.getLocations()
public abstract String[] getLocations() throws IOException, InterruptedException;
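getLocations() returns the names of the nodes where the split's data would be local. The framework uses these names purely as placement hints when scheduling map tasks; the array may be empty, and the locations are not serialized as part of the split. A minimal sketch of a typical caller is shown below; the choice of TextInputFormat and an input path taken from args[0] are illustrative assumptions, not details drawn from the examples that follow.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitLocationsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "split-locations-demo");
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Compute the splits the framework would hand to map tasks.
        TextInputFormat format = new TextInputFormat();
        List<InputSplit> splits = format.getSplits(job);

        for (InputSplit split : splits) {
            // Locality hints: hosts holding a replica of this split's data.
            String[] locations = split.getLocations();
            System.out.printf("split of length %d on [%s]%n",
                    split.getLength(), String.join(", ", locations));
        }
    }
}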
From source file: ml.shifu.guagua.yarn.util.InputSplitUtils.java
License: Apache License
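Combines small input splits into locality-aware groups: getLocations() buckets each split by host before splits are greedily packed up to maxCombinedSplitSize (logic adapted from Pig's split combination, per the PIG-1757 reference in the code).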
public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;
    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be keyed on the raw input split, not the
                    // comparable input split, because the latter overrides compareTo, changing its
                    // equality semantics to something we do not want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}
From source file: ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License: Apache License
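A near-identical split-combination routine in Shifu; it differs from the Guagua version above mainly in declaring its collections against the List/Map/Set interfaces.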
public static List<List<InputSplit>> getCombineVarSelectSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    List<Node> nodes = new ArrayList<Node>();
    Map<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;
    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            List<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        List<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        Set<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be keyed on the raw input split, not the
                    // comparable input split, because the latter overrides compareTo, changing its
                    // equality semantics to something we do not want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}
From source file: org.apache.cassandra.hadoop.cql3.CqlPagingRecordReader.java
License: Apache License
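Uses the split's locations as candidate Cassandra hosts, trying each in turn until an authenticated Thrift client can be created.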
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = context.getConfiguration();
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getReadConsistencyLevel(conf));
    keyspace = ConfigHelper.getInputKeyspace(conf);
    columns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);
    try {
        pageRowSize = Integer.parseInt(CqlConfigHelper.getInputPageRowSize(conf));
    } catch (NumberFormatException e) {
        pageRowSize = DEFAULT_CQL_PAGE_LIMIT;
    }
    partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());
    try {
        if (client != null)
            return;
        // create connection using thrift
        String[] locations = split.getLocations();
        Exception lastException = null;
        for (String location : locations) {
            int port = ConfigHelper.getInputRpcPort(conf);
            try {
                client = CqlPagingInputFormat.createAuthenticatedClient(location, port, conf);
                break;
            } catch (Exception e) {
                lastException = e;
                logger.warn("Failed to create authenticated client to {}:{}", location, port);
            }
        }
        if (client == null && lastException != null)
            throw lastException;
        // retrieve partition keys and cluster keys from system.schema_columnfamilies table
        retrieveKeys();
        client.set_keyspace(keyspace);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    rowIterator = new RowIterator();
    logger.debug("created {}", rowIterator);
}
From source file: org.apache.cassandra.hadoop.cql3.CqlRecordReader.java
License: Apache License
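Passes the split's locations to CqlConfigHelper.getInputCluster() so the Java driver connects to a host holding the split's data.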
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);
    try {
        if (cluster != null)
            return;
        // create a Cluster instance
        String[] locations = split.getLocations();
        cluster = CqlConfigHelper.getInputCluster(locations, conf);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    if (cluster != null)
        session = cluster.connect(quote(keyspace));
    if (session == null)
        throw new RuntimeException("Can't create connection session");
    // get negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();
    // If the user provides a CQL query then we will use it without validation;
    // otherwise we will fall back to building a query using the inputColumns and whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }
    if (StringUtils.isEmpty(cqlQuery))
        cqlQuery = buildQuery();
    logger.debug("cqlQuery {}", cqlQuery);
    rowIterator = new RowIterator();
    logger.debug("created {}", rowIterator);
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License: Apache License
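Flattens the locations of every mapred split of every segment into a single Stream of host names, logging and skipping any split whose getLocations() call fails.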
@VisibleForTesting
static Stream<String> getLocations(final List<WindowedDataSegment> segments,
        final org.apache.hadoop.mapred.InputFormat fio, final JobConf conf) {
    return segments.stream().sequential().flatMap((final WindowedDataSegment segment) -> {
        FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment())));
        try {
            return Arrays.stream(fio.getSplits(conf, 1))
                    .flatMap((final org.apache.hadoop.mapred.InputSplit split) -> {
                        try {
                            return Arrays.stream(split.getLocations());
                        } catch (final Exception e) {
                            logger.error(e, "Exception getting locations");
                            return Stream.empty();
                        }
                    });
        } catch (final Exception e) {
            logger.error(e, "Exception getting splits");
            return Stream.empty();
        }
    });
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormatTest.java
License: Apache License
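Verifies the failure path of the method above: a split whose getLocations() throws IOException contributes no locations to the stream.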
@Test
public void testGetLocationsSplitException() throws IOException {
    final InputFormat fio = EasyMock.mock(InputFormat.class);
    final org.apache.hadoop.mapred.InputSplit split = EasyMock.mock(org.apache.hadoop.mapred.InputSplit.class);

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andThrow(new IOException("testing"));

    EasyMock.replay(fio, split);

    Assert.assertEquals(0, DatasourceInputFormat.getLocations(segments1.subList(0, 1), fio, config).count());
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormatTest.java
License: Apache License
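Verifies the happy path: locations from successive splits are concatenated in order, with duplicates preserved.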
@Test
public void testGetLocations() throws IOException {
    final InputFormat fio = EasyMock.mock(InputFormat.class);
    final org.apache.hadoop.mapred.InputSplit split = EasyMock.mock(org.apache.hadoop.mapred.InputSplit.class);

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s1", "s2" });

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s3" });

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s4", "s2" });

    EasyMock.replay(fio, split);

    Assert.assertArrayEquals(new String[] { "s1", "s2", "s3", "s4", "s2" },
            DatasourceInputFormat.getLocations(segments1, fio, config).toArray(String[]::new));
}
From source file: org.apache.ignite.internal.processors.hadoop.impl.v2.HadoopV2Splitter.java
License: Apache License
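Maps native Hadoop splits to Ignite's split representation; for a FileSplit the locations feed a HadoopFileBlock, otherwise getLocations() supplies the hosts for a generic wrapper.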
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<HadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<HadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(HadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.ignite.internal.processors.hadoop.v2.GridHadoopV2Splitter.java
License: Apache License
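The same mapping in the Grid-prefixed variant of the Ignite API.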
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit.java
License: Apache License
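Implements getLocations() for a combined split: the hosts of all wrapped splits are aggregated, weighted by split length, sorted in descending order of total bytes, and at most the top five hosts are returned (PIG-1648).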
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
    if (locations == null) {
        HashMap<String, Long> locMap = new HashMap<String, Long>();
        Long lenInMap;
        for (InputSplit split : wrappedSplits) {
            String[] locs = split.getLocations();
            for (String loc : locs) {
                if ((lenInMap = locMap.get(loc)) == null)
                    locMap.put(loc, split.getLength());
                else
                    locMap.put(loc, lenInMap + split.getLength());
            }
        }
        Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
        Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]);
        Arrays.sort(hostSize, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
                long diff = o1.getValue() - o2.getValue();
                if (diff < 0) return 1;
                if (diff > 0) return -1;
                return 0;
            }
        });
        // at most 5 locations are kept in the list: refer to PIG-1648 for more details
        int nHost = Math.min(hostSize.length, 5);
        locations = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            locations[i] = hostSize[i].getKey();
        }
    }
    return locations;
}