List of usage examples for com.google.common.collect.MinMaxPriorityQueue#orderedBy
public static <B> Builder<B> orderedBy(Comparator<B> comparator)
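Before the source-file examples below, a minimal sketch of the builder chain (the class name, element type, and sizes are invented purely for illustration): orderedBy takes the comparator that orders both ends of the queue, and the returned Builder can optionally be given maximumSize or expectedSize before create().

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.Arrays;
import java.util.Comparator;

public class OrderedByExample {
    public static void main(String[] args) {
        MinMaxPriorityQueue<String> queue = MinMaxPriorityQueue
                .orderedBy(Comparator.comparingInt(String::length)) // order by string length
                .maximumSize(3)  // once full, adding another element evicts the greatest
                .create();

        queue.addAll(Arrays.asList("a", "bbb", "cc", "dddd"));

        // peekFirst()/pollFirst() give the least element, peekLast()/pollLast() the greatest.
        System.out.println(queue.peekFirst()); // "a"
        System.out.println(queue.peekLast());  // "bbb" ("dddd" was evicted by maximumSize(3))
    }
}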
From source file:io.druid.server.coordinator.DruidCoordinatorBalancerProfiler.java
public void profileRun() {
    Stopwatch watch = Stopwatch.createUnstarted();
    LoadQueuePeonTester fromPeon = new LoadQueuePeonTester();
    LoadQueuePeonTester toPeon = new LoadQueuePeonTester();

    EasyMock.expect(druidServer1.getName()).andReturn("from").atLeastOnce();
    EasyMock.expect(druidServer1.getCurrSize()).andReturn(30L).atLeastOnce();
    EasyMock.expect(druidServer1.getMaxSize()).andReturn(100L).atLeastOnce();
    EasyMock.expect(druidServer1.getSegments()).andReturn(segments).anyTimes();
    EasyMock.expect(druidServer1.getSegment(EasyMock.<String>anyObject())).andReturn(null).anyTimes();
    EasyMock.replay(druidServer1);

    EasyMock.expect(druidServer2.getName()).andReturn("to").atLeastOnce();
    EasyMock.expect(druidServer2.getTier()).andReturn("normal").anyTimes();
    EasyMock.expect(druidServer2.getCurrSize()).andReturn(0L).atLeastOnce();
    EasyMock.expect(druidServer2.getMaxSize()).andReturn(100L).atLeastOnce();
    EasyMock.expect(druidServer2.getSegments()).andReturn(new HashMap<String, DataSegment>()).anyTimes();
    EasyMock.expect(druidServer2.getSegment(EasyMock.<String>anyObject())).andReturn(null).anyTimes();
    EasyMock.replay(druidServer2);

    coordinator.moveSegment(EasyMock.<ImmutableDruidServer>anyObject(), EasyMock.<ImmutableDruidServer>anyObject(),
            EasyMock.<String>anyObject(), EasyMock.<LoadPeonCallback>anyObject());
    EasyMock.expectLastCall().anyTimes();
    EasyMock.replay(coordinator);

    DruidCoordinatorRuntimeParams params = DruidCoordinatorRuntimeParams.newBuilder()
            .withDruidCluster(new DruidCluster(ImmutableMap.<String, MinMaxPriorityQueue<ServerHolder>>of("normal",
                    MinMaxPriorityQueue.orderedBy(DruidCoordinatorBalancerTester.percentUsedComparator)
                            .create(Arrays.asList(new ServerHolder(druidServer1, fromPeon),
                                    new ServerHolder(druidServer2, toPeon))))))
            .withLoadManagementPeons(ImmutableMap.<String, LoadQueuePeon>of("from", fromPeon, "to", toPeon))
            .withAvailableSegments(segments.values())
            .withDynamicConfigs(
                    new CoordinatorDynamicConfig.Builder().withMaxSegmentsToMove(MAX_SEGMENTS_TO_MOVE).build())
            .withBalancerReferenceTimestamp(new DateTime("2013-01-01"))
            .build();

    DruidCoordinatorBalancerTester tester = new DruidCoordinatorBalancerTester(coordinator);
    watch.start();
    DruidCoordinatorRuntimeParams balanceParams = tester.run(params);
    System.out.println(watch.stop());
}
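In this profiler the tier's servers are handed to the queue up front via create(Iterable), so the balancer can then read the least- and most-utilized server from either end. A minimal sketch of that pattern, assuming a hypothetical Server record and a percent-used comparator (both invented here; they stand in for Druid's ServerHolder and percentUsedComparator, which are not reproduced):

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.Arrays;
import java.util.Comparator;

public class PercentUsedQueueSketch {
    // Hypothetical stand-in for Druid's ServerHolder, only for this sketch.
    record Server(String name, long used, long max) {
        double percentUsed() { return (double) used / max; }
    }

    public static void main(String[] args) {
        Comparator<Server> byPercentUsed = Comparator.comparingDouble(Server::percentUsed);

        // create(Iterable) pre-populates the queue, like create(Arrays.asList(...)) above.
        MinMaxPriorityQueue<Server> byUsage = MinMaxPriorityQueue.orderedBy(byPercentUsed)
                .create(Arrays.asList(new Server("from", 30L, 100L), new Server("to", 0L, 100L)));

        System.out.println(byUsage.peekFirst().name()); // "to"   - least utilized server
        System.out.println(byUsage.peekLast().name());  // "from" - most utilized server
    }
}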
From source file:com.linkedin.pinot.controller.helix.core.relocation.RealtimeSegmentRelocator.java
/**
 * Given a realtime tag config and an ideal state, relocate the segments
 * which are completed but still hanging around on consuming servers, one replica at a time
 * @param realtimeTagConfig
 * @param idealState
 */
protected void relocateSegments(RealtimeTagConfig realtimeTagConfig, IdealState idealState) {
    List<String> consumingServers = _helixAdmin.getInstancesInClusterWithTag(_helixManager.getClusterName(),
            realtimeTagConfig.getConsumingServerTag());
    if (consumingServers.isEmpty()) {
        throw new IllegalStateException(
                "Found no realtime consuming servers with tag " + realtimeTagConfig.getConsumingServerTag());
    }

    List<String> completedServers = _helixAdmin.getInstancesInClusterWithTag(_helixManager.getClusterName(),
            realtimeTagConfig.getCompletedServerTag());
    if (completedServers.isEmpty()) {
        throw new IllegalStateException(
                "Found no realtime completed servers with tag " + realtimeTagConfig.getCompletedServerTag());
    }

    if (completedServers.size() < Integer.valueOf(idealState.getReplicas())) {
        throw new IllegalStateException("Number of completed servers: " + completedServers.size()
                + " is less than num replicas: " + idealState.getReplicas());
    }

    // TODO: use segment assignment strategy to decide where to place relocated segment

    // Create map of completed server name to num segments it holds.
    // This will help us decide which completed server to choose for replacing a consuming server
    Map<String, Integer> completedServerToNumSegments = new HashMap<>(completedServers.size());
    for (String server : completedServers) {
        completedServerToNumSegments.put(server, 0);
    }
    for (String segmentName : idealState.getPartitionSet()) {
        Map<String, String> instanceStateMap = idealState.getInstanceStateMap(segmentName);
        for (String instance : instanceStateMap.keySet()) {
            if (completedServers.contains(instance)) {
                completedServerToNumSegments.put(instance, completedServerToNumSegments.get(instance) + 1);
            }
        }
    }

    MinMaxPriorityQueue<Map.Entry<String, Integer>> completedServersQueue = MinMaxPriorityQueue
            .orderedBy(new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> entry1, Map.Entry<String, Integer> entry2) {
                    return Integer.compare(entry1.getValue(), entry2.getValue());
                }
            }).maximumSize(completedServers.size()).create();
    completedServersQueue.addAll(completedServerToNumSegments.entrySet());

    // get new mapping for segments that need relocation
    createNewIdealState(idealState, consumingServers, completedServersQueue);
}
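The queue above orders completed servers by how many segments each already holds, so the relocation step can repeatedly take the emptiest server from the head, give it a segment, and put it back. A minimal sketch of that consume/re-add loop under the same ordering (server names and counts are made up; the real assignment logic lives in createNewIdealState and is not reproduced):

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.AbstractMap;
import java.util.Comparator;
import java.util.Map;

public class LeastLoadedServerSketch {
    public static void main(String[] args) {
        Comparator<Map.Entry<String, Integer>> bySegmentCount =
                Comparator.comparingInt(Map.Entry::getValue);
        MinMaxPriorityQueue<Map.Entry<String, Integer>> servers =
                MinMaxPriorityQueue.orderedBy(bySegmentCount).create();
        servers.add(new AbstractMap.SimpleEntry<>("server1", 5));
        servers.add(new AbstractMap.SimpleEntry<>("server2", 2));
        servers.add(new AbstractMap.SimpleEntry<>("server3", 9));

        // Assign three hypothetical segments, always to the currently emptiest server.
        for (int i = 0; i < 3; i++) {
            Map.Entry<String, Integer> emptiest = servers.pollFirst(); // fewest segments so far
            System.out.println("segment_" + i + " -> " + emptiest.getKey());
            servers.add(new AbstractMap.SimpleEntry<>(emptiest.getKey(), emptiest.getValue() + 1));
        }
    }
}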
From source file:com.metamx.druid.utils.DruidMasterBalancerProfiler.java
public void profileRun() {
    Stopwatch watch = new Stopwatch();
    LoadQueuePeonTester fromPeon = new LoadQueuePeonTester();
    LoadQueuePeonTester toPeon = new LoadQueuePeonTester();

    EasyMock.expect(druidServer1.getName()).andReturn("from").atLeastOnce();
    EasyMock.expect(druidServer1.getCurrSize()).andReturn(30L).atLeastOnce();
    EasyMock.expect(druidServer1.getMaxSize()).andReturn(100L).atLeastOnce();
    EasyMock.expect(druidServer1.getSegments()).andReturn(segments).anyTimes();
    EasyMock.expect(druidServer1.getSegment(EasyMock.<String>anyObject())).andReturn(null).anyTimes();
    EasyMock.replay(druidServer1);

    EasyMock.expect(druidServer2.getName()).andReturn("to").atLeastOnce();
    EasyMock.expect(druidServer2.getTier()).andReturn("normal").anyTimes();
    EasyMock.expect(druidServer2.getCurrSize()).andReturn(0L).atLeastOnce();
    EasyMock.expect(druidServer2.getMaxSize()).andReturn(100L).atLeastOnce();
    EasyMock.expect(druidServer2.getSegments()).andReturn(new HashMap<String, DataSegment>()).anyTimes();
    EasyMock.expect(druidServer2.getSegment(EasyMock.<String>anyObject())).andReturn(null).anyTimes();
    EasyMock.replay(druidServer2);

    master.moveSegment(EasyMock.<String>anyObject(), EasyMock.<String>anyObject(), EasyMock.<String>anyObject(),
            EasyMock.<LoadPeonCallback>anyObject());
    EasyMock.expectLastCall().anyTimes();
    EasyMock.replay(master);

    DruidMasterRuntimeParams params = DruidMasterRuntimeParams.newBuilder()
            .withDruidCluster(new DruidCluster(ImmutableMap.<String, MinMaxPriorityQueue<ServerHolder>>of("normal",
                    MinMaxPriorityQueue.orderedBy(DruidMasterBalancerTester.percentUsedComparator)
                            .create(Arrays.asList(new ServerHolder(druidServer1, fromPeon),
                                    new ServerHolder(druidServer2, toPeon))))))
            .withLoadManagementPeons(ImmutableMap.<String, LoadQueuePeon>of("from", fromPeon, "to", toPeon))
            .withAvailableSegments(segments.values())
            .withMasterSegmentSettings(
                    new MasterSegmentSettings.Builder().withMaxSegmentsToMove(MAX_SEGMENTS_TO_MOVE).build())
            .withBalancerReferenceTimestamp(new DateTime("2013-01-01"))
            .build();

    DruidMasterBalancerTester tester = new DruidMasterBalancerTester(master);
    watch.start();
    DruidMasterRuntimeParams balanceParams = tester.run(params);
    System.out.println(watch.stop());
}
From source file:it.uniroma3.mat.extendedset.intset.ImmutableConciseSet.java
private static ImmutableConciseSet doUnion(Iterator<ImmutableConciseSet> sets) { IntList retVal = new IntList(); // lhs = current word position, rhs = the iterator // Comparison is first by index, then one fills > literals > zero fills // one fills are sorted by length (longer one fills have priority) // similarily, shorter zero fills have priority MinMaxPriorityQueue<WordHolder> theQ = MinMaxPriorityQueue.orderedBy(new Comparator<WordHolder>() { @Override/* www . j a va 2 s . co m*/ public int compare(WordHolder h1, WordHolder h2) { int w1 = h1.getWord(); int w2 = h2.getWord(); int s1 = h1.getIterator().startIndex; int s2 = h2.getIterator().startIndex; if (s1 != s2) { return compareInts(s1, s2); } if (ConciseSetUtils.isOneSequence(w1)) { if (ConciseSetUtils.isOneSequence(w2)) { return -compareInts(ConciseSetUtils.getSequenceNumWords(w1), ConciseSetUtils.getSequenceNumWords(w2)); } return -1; } else if (ConciseSetUtils.isLiteral(w1)) { if (ConciseSetUtils.isOneSequence(w2)) { return 1; } else if (ConciseSetUtils.isLiteral(w2)) { return 0; } return -1; } else { if (!ConciseSetUtils.isZeroSequence(w2)) { return 1; } return compareInts(ConciseSetUtils.getSequenceNumWords(w1), ConciseSetUtils.getSequenceNumWords(w2)); } } }).create(); // populate priority queue while (sets.hasNext()) { ImmutableConciseSet set = sets.next(); if (set != null && !set.isEmpty()) { WordIterator itr = set.newWordIterator(); theQ.add(new WordHolder(itr.next(), itr)); } } int currIndex = 0; while (!theQ.isEmpty()) { // create a temp list to hold everything that will get pushed back into the priority queue after each run List<WordHolder> wordsToAdd = Lists.newArrayList(); // grab the top element from the priority queue WordHolder curr = theQ.poll(); int word = curr.getWord(); WordIterator itr = curr.getIterator(); // if the next word in the queue starts at a different point than where we ended off we need to create a zero gap // to fill the space if (currIndex < itr.startIndex) { addAndCompact(retVal, itr.startIndex - currIndex - 1); currIndex = itr.startIndex; } if (ConciseSetUtils.isOneSequence(word)) { // extract a literal from the flip bits of the one sequence int flipBitLiteral = ConciseSetUtils.getLiteralFromOneSeqFlipBit(word); // advance everything past the longest ones sequence WordHolder nextVal = theQ.peek(); while (nextVal != null && nextVal.getIterator().startIndex < itr.wordsWalked) { WordHolder entry = theQ.poll(); int w = entry.getWord(); WordIterator i = entry.getIterator(); if (i.startIndex == itr.startIndex) { // if a literal was created from a flip bit, OR it with other literals or literals from flip bits in the same // position if (ConciseSetUtils.isOneSequence(w)) { flipBitLiteral |= ConciseSetUtils.getLiteralFromOneSeqFlipBit(w); } else if (ConciseSetUtils.isLiteral(w)) { flipBitLiteral |= w; } else { flipBitLiteral |= ConciseSetUtils.getLiteralFromZeroSeqFlipBit(w); } } i.advanceTo(itr.wordsWalked); if (i.hasNext()) { wordsToAdd.add(new WordHolder(i.next(), i)); } nextVal = theQ.peek(); } // advance longest one literal forward and push result back to priority queue // if a flip bit is still needed, put it in the correct position int newWord = word & 0xC1FFFFFF; if (flipBitLiteral != ConciseSetUtils.ALL_ONES_LITERAL) { flipBitLiteral ^= ConciseSetUtils.ALL_ONES_LITERAL; int position = Integer.numberOfTrailingZeros(flipBitLiteral) + 1; newWord |= (position << 25); } addAndCompact(retVal, newWord); currIndex = itr.wordsWalked; if (itr.hasNext()) { wordsToAdd.add(new WordHolder(itr.next(), itr)); } 
} else if (ConciseSetUtils.isLiteral(word)) { // advance all other literals WordHolder nextVal = theQ.peek(); while (nextVal != null && nextVal.getIterator().startIndex == itr.startIndex) { WordHolder entry = theQ.poll(); int w = entry.getWord(); WordIterator i = entry.getIterator(); // if we still have zero fills with flipped bits, OR them here if (ConciseSetUtils.isLiteral(w)) { word |= w; } else { int flipBitLiteral = ConciseSetUtils.getLiteralFromZeroSeqFlipBit(w); if (flipBitLiteral != ConciseSetUtils.ALL_ZEROS_LITERAL) { word |= flipBitLiteral; i.advanceTo(itr.wordsWalked); } } if (i.hasNext()) { wordsToAdd.add(new WordHolder(i.next(), i)); } nextVal = theQ.peek(); } // advance the set with the current literal forward and push result back to priority queue addAndCompact(retVal, word); currIndex++; if (itr.hasNext()) { wordsToAdd.add(new WordHolder(itr.next(), itr)); } } else { // zero fills int flipBitLiteral; WordHolder nextVal = theQ.peek(); while (nextVal != null && nextVal.getIterator().startIndex == itr.startIndex) { // check if literal can be created flip bits of other zero sequences WordHolder entry = theQ.poll(); int w = entry.getWord(); WordIterator i = entry.getIterator(); flipBitLiteral = ConciseSetUtils.getLiteralFromZeroSeqFlipBit(w); if (flipBitLiteral != ConciseSetUtils.ALL_ZEROS_LITERAL) { wordsToAdd.add(new WordHolder(flipBitLiteral, i)); } else if (i.hasNext()) { wordsToAdd.add(new WordHolder(i.next(), i)); } nextVal = theQ.peek(); } // check if a literal needs to be created from the flipped bits of this sequence flipBitLiteral = ConciseSetUtils.getLiteralFromZeroSeqFlipBit(word); if (flipBitLiteral != ConciseSetUtils.ALL_ZEROS_LITERAL) { wordsToAdd.add(new WordHolder(flipBitLiteral, itr)); } else if (itr.hasNext()) { wordsToAdd.add(new WordHolder(itr.next(), itr)); } } theQ.addAll(wordsToAdd); } if (retVal.isEmpty()) { return new ImmutableConciseSet(); } return new ImmutableConciseSet(IntBuffer.wrap(retVal.toArray())); }
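The union above is essentially a k-way merge: each input set contributes a word iterator, the queue is ordered by the iterator's current position (with tie-breaking on word type), and the head is repeatedly polled, processed, advanced, and re-added. A stripped-down sketch of that merge pattern over plain sorted int arrays (the Cursor class and the inputs are invented for illustration; none of the CONCISE-specific handling of one fills, zero fills, or flip bits is reproduced):

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.Comparator;

public class KWayMergeSketch {
    // Hypothetical cursor over a sorted int array, standing in for WordIterator.
    static final class Cursor {
        final int[] values;
        int pos;
        Cursor(int[] values) { this.values = values; }
        int current() { return values[pos]; }
        boolean advance() { return ++pos < values.length; }
    }

    public static void main(String[] args) {
        MinMaxPriorityQueue<Cursor> queue = MinMaxPriorityQueue
                .orderedBy(Comparator.comparingInt(Cursor::current))
                .create();
        queue.add(new Cursor(new int[] {1, 4, 9}));
        queue.add(new Cursor(new int[] {2, 3, 10}));
        queue.add(new Cursor(new int[] {0, 7}));

        // Repeatedly take the cursor with the smallest current value, emit it, advance, re-add.
        while (!queue.isEmpty()) {
            Cursor least = queue.poll();
            System.out.print(least.current() + " ");
            if (least.advance()) {
                queue.add(least);
            }
        }
        // prints: 0 1 2 3 4 7 9 10
    }
}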
From source file:org.apache.tephra.hbase.txprune.InvalidListPruningDebugTool.java
/**
 * Return a list of RegionPruneInfo. These regions are the ones that have the lowest prune upper bounds.
 * If -1 is passed in, all the regions and their prune upper bound will be returned. Note that only the regions
 * that are known to be live will be returned.
 *
 * @param numRegions number of regions
 * @param time time in milliseconds or relative time, regions recorded before the given time are returned
 * @return Map of region name and its prune upper bound
 */
@Override
@SuppressWarnings("WeakerAccess")
public SortedSet<RegionPruneInfoPretty> getIdleRegions(Integer numRegions, String time) throws IOException {
    List<RegionPruneInfo> regionPruneInfos = dataJanitorState.getPruneInfoForRegions(null);
    if (regionPruneInfos.isEmpty()) {
        return new TreeSet<>();
    }

    // Create a set with region names
    Set<String> pruneRegionNameSet = new HashSet<>();
    for (RegionPruneInfo regionPruneInfo : regionPruneInfos) {
        pruneRegionNameSet.add(regionPruneInfo.getRegionNameAsString());
    }

    // Fetch the latest live regions
    RegionsAtTime latestRegions = getRegionsOnOrBeforeTime(NOW);
    // Fetch the regions at the given time
    RegionsAtTime timeRegions = getRegionsOnOrBeforeTime(time);
    Set<String> liveRegions = Sets.intersection(latestRegions.getRegions(), timeRegions.getRegions());
    Set<String> liveRegionsWithPruneInfo = Sets.intersection(liveRegions, pruneRegionNameSet);
    List<RegionPruneInfo> liveRegionWithPruneInfoList = new ArrayList<>();
    for (RegionPruneInfo regionPruneInfo : regionPruneInfos) {
        if (liveRegionsWithPruneInfo.contains(regionPruneInfo.getRegionNameAsString())) {
            liveRegionWithPruneInfoList.add(regionPruneInfo);
        }

        // Use the subset of live regions and prune regions
        regionPruneInfos = liveRegionWithPruneInfoList;
    }

    if (numRegions < 0) {
        numRegions = regionPruneInfos.size();
    }

    Comparator<RegionPruneInfo> comparator = new Comparator<RegionPruneInfo>() {
        @Override
        public int compare(RegionPruneInfo o1, RegionPruneInfo o2) {
            int result = Long.compare(o1.getPruneUpperBound(), o2.getPruneUpperBound());
            if (result == 0) {
                return o1.getRegionNameAsString().compareTo(o2.getRegionNameAsString());
            }
            return result;
        }
    };
    MinMaxPriorityQueue<RegionPruneInfoPretty> lowestPrunes = MinMaxPriorityQueue.orderedBy(comparator)
            .maximumSize(numRegions).create();
    for (RegionPruneInfo pruneInfo : regionPruneInfos) {
        lowestPrunes.add(new RegionPruneInfoPretty(pruneInfo));
    }

    SortedSet<RegionPruneInfoPretty> regions = new TreeSet<>(comparator);
    regions.addAll(lowestPrunes);
    return regions;
}
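Here maximumSize(numRegions) turns the queue into a bounded "keep the N smallest" structure: every prune-info record is offered, and once the queue is full each add evicts whichever element is currently greatest under the comparator. A minimal sketch of that top-N-smallest idiom over plain longs (the values and the limit of 3 are made up for illustration):

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.Comparator;

public class SmallestNSketch {
    public static void main(String[] args) {
        // Keep only the 3 smallest values seen; add() evicts the current greatest once full.
        MinMaxPriorityQueue<Long> smallest = MinMaxPriorityQueue
                .orderedBy(Comparator.<Long>naturalOrder())
                .maximumSize(3)
                .create();

        for (long value : new long[] {42L, 7L, 99L, 3L, 15L, 58L}) {
            smallest.add(value);
        }

        // Drain in ascending order.
        while (!smallest.isEmpty()) {
            System.out.println(smallest.pollFirst()); // prints 3, 7, 15
        }
    }
}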
From source file:org.apache.hadoop.hbase.master.balancer.DefaultLoadBalancer.java
/** * Generate a global load balancing plan according to the specified map of * server information to the most loaded regions of each server. * * The load balancing invariant is that all servers are within 1 region of the * average number of regions per server. If the average is an integer number, * all servers will be balanced to the average. Otherwise, all servers will * have either floor(average) or ceiling(average) regions. * * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that * we can fetch from both ends of the queue. * At the beginning, we check whether there was empty region server * just discovered by Master. If so, we alternately choose new / old * regions from head / tail of regionsToMove, respectively. This alternation * avoids clustering young regions on the newly discovered region server. * Otherwise, we choose new regions from head of regionsToMove. * //from ww w.j a va2 s .c o m * Another improvement from HBASE-3609 is that we assign regions from * regionsToMove to underloaded servers in round-robin fashion. * Previously one underloaded server would be filled before we move onto * the next underloaded server, leading to clustering of young regions. * * Finally, we randomly shuffle underloaded servers so that they receive * offloaded regions relatively evenly across calls to balanceCluster(). * * The algorithm is currently implemented as such: * * <ol> * <li>Determine the two valid numbers of regions each server should have, * <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average). * * <li>Iterate down the most loaded servers, shedding regions from each so * each server hosts exactly <b>MAX</b> regions. Stop once you reach a * server that already has <= <b>MAX</b> regions. * <p> * Order the regions to move from most recent to least. * * <li>Iterate down the least loaded servers, assigning regions so each server * has exactly </b>MIN</b> regions. Stop once you reach a server that * already has >= <b>MIN</b> regions. * * Regions being assigned to underloaded servers are those that were shed * in the previous step. It is possible that there were not enough * regions shed to fill each underloaded server to <b>MIN</b>. If so we * end up with a number of regions required to do so, <b>neededRegions</b>. * * It is also possible that we were able to fill each underloaded but ended * up with regions that were unassigned from overloaded servers but that * still do not have assignment. * * If neither of these conditions hold (no regions needed to fill the * underloaded servers, no regions leftover from overloaded servers), * we are done and return. Otherwise we handle these cases below. * * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers), * we iterate the most loaded servers again, shedding a single server from * each (this brings them from having <b>MAX</b> regions to having * <b>MIN</b> regions). * * <li>We now definitely have more regions that need assignment, either from * the previous step or from the original shedding from overloaded servers. * Iterate the least loaded servers filling each to <b>MIN</b>. * * <li>If we still have more regions that need assignment, again iterate the * least loaded servers, this time giving each one (filling them to * </b>MAX</b>) until we run out. * * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions. * * In addition, any server hosting >= <b>MAX</b> regions is guaranteed * to end up with <b>MAX</b> regions at the end of the balancing. 
This * ensures the minimal number of regions possible are moved. * </ol> * * TODO: We can at-most reassign the number of regions away from a particular * server to be how many they report as most loaded. * Should we just keep all assignment in memory? Any objections? * Does this mean we need HeapSize on HMaster? Or just careful monitor? * (current thinking is we will hold all assignments in memory) * * @param clusterMap Map of regionservers and their load/region information to * a list of their most loaded regions * @return a list of regions to be moved, including source and destination, * or null if cluster is already balanced */ public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterMap) { boolean emptyRegionServerPresent = false; long startTime = System.currentTimeMillis(); ClusterLoadState cs = new ClusterLoadState(clusterMap); if (!this.needsBalance(cs)) return null; int numServers = cs.getNumServers(); NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad(); int numRegions = cs.getNumRegions(); int min = numRegions / numServers; int max = numRegions % numServers == 0 ? min : min + 1; // Using to check balance result. StringBuilder strBalanceParam = new StringBuilder(); strBalanceParam.append("Balance parameter: numRegions=").append(numRegions).append(", numServers=") .append(numServers).append(", max=").append(max).append(", min=").append(min); LOG.debug(strBalanceParam.toString()); // Balance the cluster // TODO: Look at data block locality or a more complex load to do this MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create(); List<RegionPlan> regionsToReturn = new ArrayList<RegionPlan>(); // Walk down most loaded, pruning each to the max int serversOverloaded = 0; // flag used to fetch regions from head and tail of list, alternately boolean fetchFromTail = false; Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>(); for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) { ServerAndLoad sal = server.getKey(); int regionCount = sal.getLoad(); if (regionCount <= max) { serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0)); break; } serversOverloaded++; List<HRegionInfo> regions = server.getValue(); int numToOffload = Math.min(regionCount - max, regions.size()); // account for the out-of-band regions which were assigned to this server // after some other region server crashed Collections.sort(regions, riComparator); int numTaken = 0; for (int i = 0; i <= numToOffload;) { HRegionInfo hri = regions.get(i); // fetch from head if (fetchFromTail) { hri = regions.get(regions.size() - 1 - i); } i++; // Don't rebalance meta regions. 
if (hri.isMetaRegion()) continue; regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null)); numTaken++; if (numTaken >= numToOffload) break; // fetch in alternate order if there is new region server if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken)); } int totalNumMoved = regionsToMove.size(); // Walk down least loaded, filling each to the min int neededRegions = 0; // number of regions needed to bring all up to min fetchFromTail = false; Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>(); float average = (float) numRegions / numServers; // for logging int maxToTake = numRegions - (int) average; for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { if (maxToTake == 0) break; // no more to take int regionCount = server.getKey().getLoad(); if (regionCount >= min && regionCount > 0) { continue; // look for other servers which haven't reached min } int regionsToPut = min - regionCount; if (regionsToPut == 0) { regionsToPut = 1; maxToTake--; } underloadedServers.put(server.getKey().getServerName(), regionsToPut); } // number of servers that get new regions int serversUnderloaded = underloadedServers.size(); int incr = 1; List<ServerName> sns = Arrays .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded])); Collections.shuffle(sns, RANDOM); while (regionsToMove.size() > 0) { int cnt = 0; int i = incr > 0 ? 0 : underloadedServers.size() - 1; for (; i >= 0 && i < underloadedServers.size(); i += incr) { if (regionsToMove.isEmpty()) break; ServerName si = sns.get(i); int numToTake = underloadedServers.get(si); if (numToTake == 0) continue; addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } underloadedServers.put(si, numToTake - 1); cnt++; BalanceInfo bi = serverBalanceInfo.get(si); if (bi == null) { bi = new BalanceInfo(0, 0); serverBalanceInfo.put(si, bi); } bi.setNumRegionsAdded(bi.getNumRegionsAdded() + 1); } if (cnt == 0) break; // iterates underloadedServers in the other direction incr = -incr; } for (Integer i : underloadedServers.values()) { // If we still want to take some, increment needed neededRegions += i; } // If none needed to fill all to min and none left to drain all to max, // we are done if (neededRegions == 0 && regionsToMove.isEmpty()) { long endTime = System.currentTimeMillis(); LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; } // Need to do a second pass. // Either more regions to assign out or servers that are still underloaded // If we need more to fill min, grab one from each most loaded until enough if (neededRegions != 0) { // Walk down most loaded, grabbing one from each until we get enough for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) { BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); int idx = balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload(); if (idx >= server.getValue().size()) break; HRegionInfo region = server.getValue().get(idx); if (region.isMetaRegion()) continue; // Don't move meta regions. 
regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null)); totalNumMoved++; if (--neededRegions == 0) { // No more regions needed, done shedding break; } } } // Now we have a set of regions that must be all assigned out // Assign each underloaded up to the min, then if leftovers, assign to max // Walk down least loaded, assigning to each to fill up to min for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); if (regionCount >= min) break; BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); if (balanceInfo != null) { regionCount += balanceInfo.getNumRegionsAdded(); } if (regionCount >= min) { continue; } int numToTake = min - regionCount; int numTaken = 0; while (numTaken < numToTake && 0 < regionsToMove.size()) { addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); numTaken++; if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } } // If we still have regions to dish out, assign underloaded to max if (0 < regionsToMove.size()) { for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); if (regionCount >= max) { break; } addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } if (regionsToMove.isEmpty()) { break; } } } long endTime = System.currentTimeMillis(); if (!regionsToMove.isEmpty() || neededRegions != 0) { // Emit data so can diagnose how balancer went astray. LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded=" + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded); StringBuilder sb = new StringBuilder(); for (Map.Entry<ServerName, List<HRegionInfo>> e : clusterMap.entrySet()) { if (sb.length() > 0) sb.append(", "); sb.append(e.getKey().toString()); sb.append(" "); sb.append(e.getValue().size()); } LOG.warn("Input " + sb.toString()); } // All done! LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; }
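HBASE-3609, cited in the javadoc above, chose MinMaxPriorityQueue precisely because regionsToMove must be consumed from both ends: new regions from the head and old regions from the tail, alternating whenever an empty region server was just discovered, so young regions do not all pile onto the new server. A minimal sketch of that alternating drain, using a hypothetical Plan record ordered by region age (the type and ages are invented; the real code orders RegionPlan with rpComparator):

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.Comparator;

public class AlternatingDrainSketch {
    // Hypothetical stand-in for RegionPlan: lower age = more recently created region.
    record Plan(String region, long age) {}

    public static void main(String[] args) {
        MinMaxPriorityQueue<Plan> regionsToMove = MinMaxPriorityQueue
                .orderedBy(Comparator.comparingLong(Plan::age))
                .create();
        regionsToMove.add(new Plan("r1", 10));
        regionsToMove.add(new Plan("r2", 50));
        regionsToMove.add(new Plan("r3", 200));
        regionsToMove.add(new Plan("r4", 800));

        boolean emptyRegionServerPresent = true; // pretend a brand-new region server just joined
        boolean fetchFromTail = false;

        while (!regionsToMove.isEmpty()) {
            // Head = youngest region, tail = oldest; alternate ends when a new server is present.
            Plan next = fetchFromTail ? regionsToMove.pollLast() : regionsToMove.pollFirst();
            System.out.println((fetchFromTail ? "tail -> " : "head -> ") + next.region());
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
        // prints: head -> r1, tail -> r4, head -> r2, tail -> r3
    }
}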
From source file:org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer.java
/** * Generate a global load balancing plan according to the specified map of * server information to the most loaded regions of each server. * * The load balancing invariant is that all servers are within 1 region of the * average number of regions per server. If the average is an integer number, * all servers will be balanced to the average. Otherwise, all servers will * have either floor(average) or ceiling(average) regions. * * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that * we can fetch from both ends of the queue. * At the beginning, we check whether there was empty region server * just discovered by Master. If so, we alternately choose new / old * regions from head / tail of regionsToMove, respectively. This alternation * avoids clustering young regions on the newly discovered region server. * Otherwise, we choose new regions from head of regionsToMove. * // w w w . j a v a 2s .c o m * Another improvement from HBASE-3609 is that we assign regions from * regionsToMove to underloaded servers in round-robin fashion. * Previously one underloaded server would be filled before we move onto * the next underloaded server, leading to clustering of young regions. * * Finally, we randomly shuffle underloaded servers so that they receive * offloaded regions relatively evenly across calls to balanceCluster(). * * The algorithm is currently implemented as such: * * <ol> * <li>Determine the two valid numbers of regions each server should have, * <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average). * * <li>Iterate down the most loaded servers, shedding regions from each so * each server hosts exactly <b>MAX</b> regions. Stop once you reach a * server that already has <= <b>MAX</b> regions. * <p> * Order the regions to move from most recent to least. * * <li>Iterate down the least loaded servers, assigning regions so each server * has exactly </b>MIN</b> regions. Stop once you reach a server that * already has >= <b>MIN</b> regions. * * Regions being assigned to underloaded servers are those that were shed * in the previous step. It is possible that there were not enough * regions shed to fill each underloaded server to <b>MIN</b>. If so we * end up with a number of regions required to do so, <b>neededRegions</b>. * * It is also possible that we were able to fill each underloaded but ended * up with regions that were unassigned from overloaded servers but that * still do not have assignment. * * If neither of these conditions hold (no regions needed to fill the * underloaded servers, no regions leftover from overloaded servers), * we are done and return. Otherwise we handle these cases below. * * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers), * we iterate the most loaded servers again, shedding a single server from * each (this brings them from having <b>MAX</b> regions to having * <b>MIN</b> regions). * * <li>We now definitely have more regions that need assignment, either from * the previous step or from the original shedding from overloaded servers. * Iterate the least loaded servers filling each to <b>MIN</b>. * * <li>If we still have more regions that need assignment, again iterate the * least loaded servers, this time giving each one (filling them to * </b>MAX</b>) until we run out. * * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions. * * In addition, any server hosting >= <b>MAX</b> regions is guaranteed * to end up with <b>MAX</b> regions at the end of the balancing. 
This * ensures the minimal number of regions possible are moved. * </ol> * * TODO: We can at-most reassign the number of regions away from a particular * server to be how many they report as most loaded. * Should we just keep all assignment in memory? Any objections? * Does this mean we need HeapSize on HMaster? Or just careful monitor? * (current thinking is we will hold all assignments in memory) * * @param clusterMap Map of regionservers and their load/region information to * a list of their most loaded regions * @return a list of regions to be moved, including source and destination, * or null if cluster is already balanced */ public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterMap) { List<RegionPlan> regionsToReturn = balanceMasterRegions(clusterMap); if (regionsToReturn != null) { return regionsToReturn; } filterExcludedServers(clusterMap); boolean emptyRegionServerPresent = false; long startTime = System.currentTimeMillis(); Collection<ServerName> backupMasters = getBackupMasters(); ClusterLoadState cs = new ClusterLoadState(masterServerName, backupMasters, backupMasterWeight, clusterMap); if (!this.needsBalance(cs)) return null; int numServers = cs.getNumServers(); NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad(); int numRegions = cs.getNumRegions(); float average = cs.getLoadAverage(); int max = (int) Math.ceil(average); int min = (int) average; // Using to check balance result. StringBuilder strBalanceParam = new StringBuilder(); strBalanceParam.append("Balance parameter: numRegions=").append(numRegions).append(", numServers=") .append(numServers).append(", numBackupMasters=").append(cs.getNumBackupMasters()) .append(", backupMasterWeight=").append(backupMasterWeight).append(", max=").append(max) .append(", min=").append(min); LOG.debug(strBalanceParam.toString()); // Balance the cluster // TODO: Look at data block locality or a more complex load to do this MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create(); regionsToReturn = new ArrayList<RegionPlan>(); // Walk down most loaded, pruning each to the max int serversOverloaded = 0; // flag used to fetch regions from head and tail of list, alternately boolean fetchFromTail = false; Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>(); for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) { ServerAndLoad sal = server.getKey(); int load = sal.getLoad(); if (load <= max) { serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0)); break; } serversOverloaded++; List<HRegionInfo> regions = server.getValue(); int w = 1; // Normal region server has weight 1 if (backupMasters != null && backupMasters.contains(sal.getServerName())) { w = backupMasterWeight; // Backup master has heavier weight } int numToOffload = Math.min((load - max) / w, regions.size()); // account for the out-of-band regions which were assigned to this server // after some other region server crashed Collections.sort(regions, riComparator); int numTaken = 0; for (int i = 0; i <= numToOffload;) { HRegionInfo hri = regions.get(i); // fetch from head if (fetchFromTail) { hri = regions.get(regions.size() - 1 - i); } i++; // Don't rebalance special regions. 
if (shouldBeOnMaster(hri) && masterServerName.equals(sal.getServerName())) continue; regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null)); numTaken++; if (numTaken >= numToOffload) break; // fetch in alternate order if there is new region server if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken)); } int totalNumMoved = regionsToMove.size(); // Walk down least loaded, filling each to the min int neededRegions = 0; // number of regions needed to bring all up to min fetchFromTail = false; Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>(); int maxToTake = numRegions - min; for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { if (maxToTake == 0) break; // no more to take int load = server.getKey().getLoad(); if (load >= min && load > 0) { continue; // look for other servers which haven't reached min } int w = 1; // Normal region server has weight 1 if (backupMasters != null && backupMasters.contains(server.getKey().getServerName())) { w = backupMasterWeight; // Backup master has heavier weight } int regionsToPut = (min - load) / w; if (regionsToPut == 0) { regionsToPut = 1; } maxToTake -= regionsToPut; underloadedServers.put(server.getKey().getServerName(), regionsToPut); } // number of servers that get new regions int serversUnderloaded = underloadedServers.size(); int incr = 1; List<ServerName> sns = Arrays .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded])); Collections.shuffle(sns, RANDOM); while (regionsToMove.size() > 0) { int cnt = 0; int i = incr > 0 ? 0 : underloadedServers.size() - 1; for (; i >= 0 && i < underloadedServers.size(); i += incr) { if (regionsToMove.isEmpty()) break; ServerName si = sns.get(i); int numToTake = underloadedServers.get(si); if (numToTake == 0) continue; addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } underloadedServers.put(si, numToTake - 1); cnt++; BalanceInfo bi = serverBalanceInfo.get(si); if (bi == null) { bi = new BalanceInfo(0, 0); serverBalanceInfo.put(si, bi); } bi.setNumRegionsAdded(bi.getNumRegionsAdded() + 1); } if (cnt == 0) break; // iterates underloadedServers in the other direction incr = -incr; } for (Integer i : underloadedServers.values()) { // If we still want to take some, increment needed neededRegions += i; } // If none needed to fill all to min and none left to drain all to max, // we are done if (neededRegions == 0 && regionsToMove.isEmpty()) { long endTime = System.currentTimeMillis(); LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; } // Need to do a second pass. // Either more regions to assign out or servers that are still underloaded // If we need more to fill min, grab one from each most loaded until enough if (neededRegions != 0) { // Walk down most loaded, grabbing one from each until we get enough for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) { BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); int idx = balanceInfo == null ? 
0 : balanceInfo.getNextRegionForUnload(); if (idx >= server.getValue().size()) break; HRegionInfo region = server.getValue().get(idx); if (region.isMetaRegion()) continue; // Don't move meta regions. regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null)); totalNumMoved++; if (--neededRegions == 0) { // No more regions needed, done shedding break; } } } // Now we have a set of regions that must be all assigned out // Assign each underloaded up to the min, then if leftovers, assign to max // Walk down least loaded, assigning to each to fill up to min for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); if (regionCount >= min) break; BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); if (balanceInfo != null) { regionCount += balanceInfo.getNumRegionsAdded(); } if (regionCount >= min) { continue; } int numToTake = min - regionCount; int numTaken = 0; while (numTaken < numToTake && 0 < regionsToMove.size()) { addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); numTaken++; if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } } // If we still have regions to dish out, assign underloaded to max if (0 < regionsToMove.size()) { for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); if (balanceInfo != null) { regionCount += balanceInfo.getNumRegionsAdded(); } if (regionCount >= max) { break; } addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } if (regionsToMove.isEmpty()) { break; } } } long endTime = System.currentTimeMillis(); if (!regionsToMove.isEmpty() || neededRegions != 0) { // Emit data so can diagnose how balancer went astray. LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded=" + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded); StringBuilder sb = new StringBuilder(); for (Map.Entry<ServerName, List<HRegionInfo>> e : clusterMap.entrySet()) { if (sb.length() > 0) sb.append(", "); sb.append(e.getKey().toString()); sb.append(" "); sb.append(e.getValue().size()); } LOG.warn("Input " + sb.toString()); } // All done! LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; }
From source file:com.alibaba.wasp.master.balancer.DefaultLoadBalancer.java
/** * Generate a global load balancing plan according to the specified map of * server information to the most loaded entityGroups of each server. * /*w w w .j a v a 2 s .com*/ * The load balancing invariant is that all servers are within 1 entityGroup of the * average number of entityGroups per server. If the average is an integer number, * all servers will be balanced to the average. Otherwise, all servers will * have either floor(average) or ceiling(average) entityGroups. * * HBASE-3609 Modeled entityGroupsToMove using Guava's MinMaxPriorityQueue so that * we can fetch from both ends of the queue. At the beginning, we check * whether there was empty entityGroup server just discovered by Master. If so, we * alternately choose new / old entityGroups from head / tail of entityGroupsToMove, * respectively. This alternation avoids clustering young entityGroups on the newly * discovered entityGroup server. Otherwise, we choose new entityGroups from head of * entityGroupsToMove. * * Another improvement from HBASE-3609 is that we assign entityGroups from * entityGroupsToMove to underloaded servers in round-robin fashion. Previously one * underloaded server would be filled before we move onto the next underloaded * server, leading to clustering of young entityGroups. * * Finally, we randomly shuffle underloaded servers so that they receive * offloaded entityGroups relatively evenly across calls to balanceCluster(). * * The algorithm is currently implemented as such: * * <ol> * <li>Determine the two valid numbers of entityGroups each server should have, * <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average). * * <li>Iterate down the most loaded servers, shedding entityGroups from each so * each server hosts exactly <b>MAX</b> entityGroups. Stop once you reach a server * that already has <= <b>MAX</b> entityGroups. * <p> * Order the entityGroups to move from most recent to least. * * <li>Iterate down the least loaded servers, assigning entityGroups so each server * has exactly </b>MIN</b> entityGroups. Stop once you reach a server that already * has >= <b>MIN</b> entityGroups. * * EntityGroups being assigned to underloaded servers are those that were shed in * the previous step. It is possible that there were not enough entityGroups shed * to fill each underloaded server to <b>MIN</b>. If so we end up with a * number of entityGroups required to do so, <b>neededEntityGroups</b>. * * It is also possible that we were able to fill each underloaded but ended up * with entityGroups that were unassigned from overloaded servers but that still do * not have assignment. * * If neither of these conditions hold (no entityGroups needed to fill the * underloaded servers, no entityGroups leftover from overloaded servers), we are * done and return. Otherwise we handle these cases below. * * <li>If <b>neededEntityGroups</b> is non-zero (still have underloaded servers), * we iterate the most loaded servers again, shedding a single server from * each (this brings them from having <b>MAX</b> entityGroups to having <b>MIN</b> * entityGroups). * * <li>We now definitely have more entityGroups that need assignment, either from * the previous step or from the original shedding from overloaded servers. * Iterate the least loaded servers filling each to <b>MIN</b>. * * <li>If we still have more entityGroups that need assignment, again iterate the * least loaded servers, this time giving each one (filling them to * </b>MAX</b>) until we run out. 
* * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> entityGroups. * * In addition, any server hosting >= <b>MAX</b> entityGroups is guaranteed to * end up with <b>MAX</b> entityGroups at the end of the balancing. This ensures * the minimal number of entityGroups possible are moved. * </ol> * * TODO: We can at-most reassign the number of entityGroups away from a particular * server to be how many they report as most loaded. Should we just keep all * assignment in memory? Any objections? Does this mean we need HeapSize on * HMaster? Or just careful monitor? (current thinking is we will hold all * assignments in memory) * * @param clusterState Map of entityGroupservers and their load/entityGroup information * to a list of their most loaded entityGroups * @return a list of entityGroups to be moved, including source and destination, or * null if cluster is already balanced */ public List<EntityGroupPlan> balanceCluster(Map<ServerName, List<EntityGroupInfo>> clusterMap) { boolean emptyFServerPresent = false; long startTime = System.currentTimeMillis(); ClusterLoadState cs = new ClusterLoadState(clusterMap); int numServers = cs.getNumServers(); if (numServers == 0) { LOG.debug("numServers=0 so skipping load balancing"); return null; } NavigableMap<ServerAndLoad, List<EntityGroupInfo>> serversByLoad = cs.getServersByLoad(); int numEntityGroups = cs.getNumEntityGroups(); if (!this.needsBalance(cs)) { // Skipped because no server outside (min,max) range float average = cs.getLoadAverage(); // for logging LOG.info("Skipping load balancing because balanced cluster; " + "servers=" + numServers + " " + "entityGroups=" + numEntityGroups + " average=" + average + " " + "mostloaded=" + serversByLoad.lastKey().getLoad() + " leastloaded=" + serversByLoad.firstKey().getLoad()); return null; } int min = numEntityGroups / numServers; int max = numEntityGroups % numServers == 0 ? min : min + 1; // Using to check balance result. 
StringBuilder strBalanceParam = new StringBuilder(); strBalanceParam.append("Balance parameter: numEntityGroups=").append(numEntityGroups) .append(", numServers=").append(numServers).append(", max=").append(max).append(", min=") .append(min); LOG.debug(strBalanceParam.toString()); // Balance the cluster // TODO: Look at data block locality or a more complex load to do this MinMaxPriorityQueue<EntityGroupPlan> entityGroupsToMove = MinMaxPriorityQueue.orderedBy(rpComparator) .create(); List<EntityGroupPlan> entityGroupsToReturn = new ArrayList<EntityGroupPlan>(); // Walk down most loaded, pruning each to the max int serversOverloaded = 0; // flag used to fetch entityGroups from head and tail of list, alternately boolean fetchFromTail = false; Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>(); for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.descendingMap().entrySet()) { ServerAndLoad sal = server.getKey(); int entityGroupCount = sal.getLoad(); if (entityGroupCount <= max) { serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0)); break; } serversOverloaded++; List<EntityGroupInfo> entityGroups = server.getValue(); int numToOffload = Math.min(entityGroupCount - max, entityGroups.size()); // account for the out-of-band entityGroups which were assigned to this server // after some other entityGroup server crashed Collections.sort(entityGroups, riComparator); int numTaken = 0; for (int i = 0; i <= numToOffload;) { EntityGroupInfo egInfo = entityGroups.get(i); // fetch from head if (fetchFromTail) { egInfo = entityGroups.get(entityGroups.size() - 1 - i); } i++; entityGroupsToMove.add(new EntityGroupPlan(egInfo, sal.getServerName(), null)); numTaken++; if (numTaken >= numToOffload) break; // fetch in alternate order if there is new entityGroup server if (emptyFServerPresent) { fetchFromTail = !fetchFromTail; } } serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken)); } int totalNumMoved = entityGroupsToMove.size(); // Walk down least loaded, filling each to the min int neededEntityGroups = 0; // number of entityGroups needed to bring all up to min fetchFromTail = false; Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>(); for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) { int entityGroupCount = server.getKey().getLoad(); if (entityGroupCount >= min) { break; } underloadedServers.put(server.getKey().getServerName(), min - entityGroupCount); } // number of servers that get new entityGroups int serversUnderloaded = underloadedServers.size(); int incr = 1; List<ServerName> sns = Arrays .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded])); Collections.shuffle(sns, RANDOM); while (entityGroupsToMove.size() > 0) { int cnt = 0; int i = incr > 0 ? 
0 : underloadedServers.size() - 1; for (; i >= 0 && i < underloadedServers.size(); i += incr) { if (entityGroupsToMove.isEmpty()) break; ServerName si = sns.get(i); int numToTake = underloadedServers.get(si); if (numToTake == 0) continue; addEntityGroupPlan(entityGroupsToMove, fetchFromTail, si, entityGroupsToReturn); if (emptyFServerPresent) { fetchFromTail = !fetchFromTail; } underloadedServers.put(si, numToTake - 1); cnt++; BalanceInfo bi = serverBalanceInfo.get(si); if (bi == null) { bi = new BalanceInfo(0, 0); serverBalanceInfo.put(si, bi); } bi.setNumEntityGroupsAdded(bi.getNumEntityGroupsAdded() + 1); } if (cnt == 0) break; // iterates underloadedServers in the other direction incr = -incr; } for (Integer i : underloadedServers.values()) { // If we still want to take some, increment needed neededEntityGroups += i; } // If none needed to fill all to min and none left to drain all to max, // we are done if (neededEntityGroups == 0 && entityGroupsToMove.isEmpty()) { long endTime = System.currentTimeMillis(); LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " entityGroups off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return entityGroupsToReturn; } // Need to do a second pass. // Either more entityGroups to assign out or servers that are still underloaded // If we need more to fill min, grab one from each most loaded until enough if (neededEntityGroups != 0) { // Walk down most loaded, grabbing one from each until we get enough for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.descendingMap() .entrySet()) { BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); int idx = balanceInfo == null ? 
0 : balanceInfo.getNextEntityGroupForUnload(); if (idx >= server.getValue().size()) break; EntityGroupInfo entityGroup = server.getValue().get(idx); entityGroupsToMove.add(new EntityGroupPlan(entityGroup, server.getKey().getServerName(), null)); totalNumMoved++; if (--neededEntityGroups == 0) { // No more entityGroups needed, done shedding break; } } } // Now we have a set of entityGroups that must be all assigned out // Assign each underloaded up to the min, then if leftovers, assign to max // Walk down least loaded, assigning to each to fill up to min for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) { int entityGroupCount = server.getKey().getLoad(); if (entityGroupCount >= min) break; BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); if (balanceInfo != null) { entityGroupCount += balanceInfo.getNumEntityGroupsAdded(); } if (entityGroupCount >= min) { continue; } int numToTake = min - entityGroupCount; int numTaken = 0; while (numTaken < numToTake && 0 < entityGroupsToMove.size()) { addEntityGroupPlan(entityGroupsToMove, fetchFromTail, server.getKey().getServerName(), entityGroupsToReturn); numTaken++; if (emptyFServerPresent) { fetchFromTail = !fetchFromTail; } } } // If we still have entityGroups to dish out, assign underloaded to max if (0 < entityGroupsToMove.size()) { for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) { int entityGroupCount = server.getKey().getLoad(); if (entityGroupCount >= max) { break; } addEntityGroupPlan(entityGroupsToMove, fetchFromTail, server.getKey().getServerName(), entityGroupsToReturn); if (emptyFServerPresent) { fetchFromTail = !fetchFromTail; } if (entityGroupsToMove.isEmpty()) { break; } } } long endTime = System.currentTimeMillis(); if (!entityGroupsToMove.isEmpty() || neededEntityGroups != 0) { // Emit data so can diagnose how balancer went astray. LOG.warn("entityGroupsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded=" + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded); StringBuilder sb = new StringBuilder(); for (Map.Entry<ServerName, List<EntityGroupInfo>> e : clusterMap.entrySet()) { if (sb.length() > 0) sb.append(", "); sb.append(e.getKey().toString()); sb.append(" "); sb.append(e.getValue().size()); } LOG.warn("Input " + sb.toString()); } // All done! LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved + " entityGroups off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return entityGroupsToReturn; }
From source file:org.apache.hadoop.hbase.master.DefaultLoadBalancer.java
/**
 * Generate a global load balancing plan according to the specified map of
 * server information to the most loaded regions of each server.
 *
 * The load balancing invariant is that all servers are within 1 region of the
 * average number of regions per server. If the average is an integer number,
 * all servers will be balanced to the average. Otherwise, all servers will
 * have either floor(average) or ceiling(average) regions.
 *
 * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that
 * we can fetch from both ends of the queue.
 * At the beginning, we check whether there was an empty region server
 * just discovered by the Master. If so, we alternately choose new / old
 * regions from head / tail of regionsToMove, respectively. This alternation
 * avoids clustering young regions on the newly discovered region server.
 * Otherwise, we choose new regions from the head of regionsToMove.
 *
 * Another improvement from HBASE-3609 is that we assign regions from
 * regionsToMove to underloaded servers in round-robin fashion.
 * Previously one underloaded server would be filled before we moved onto
 * the next underloaded server, leading to clustering of young regions.
 *
 * Finally, we randomly shuffle underloaded servers so that they receive
 * offloaded regions relatively evenly across calls to balanceCluster().
 *
 * The algorithm is currently implemented as such:
 *
 * <ol>
 * <li>Determine the two valid numbers of regions each server should have,
 *     <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average).
 *
 * <li>Iterate down the most loaded servers, shedding regions from each so
 *     each server hosts exactly <b>MAX</b> regions. Stop once you reach a
 *     server that already has <= <b>MAX</b> regions.
 *     <p>
 *     Order the regions to move from most recent to least.
 *
 * <li>Iterate down the least loaded servers, assigning regions so each server
 *     has exactly <b>MIN</b> regions. Stop once you reach a server that
 *     already has >= <b>MIN</b> regions.
 *
 *     Regions being assigned to underloaded servers are those that were shed
 *     in the previous step. It is possible that there were not enough
 *     regions shed to fill each underloaded server to <b>MIN</b>. If so we
 *     end up with a number of regions required to do so, <b>neededRegions</b>.
 *
 *     It is also possible that we were able to fill each underloaded server
 *     but ended up with regions that were unassigned from overloaded servers
 *     and that still do not have assignment.
 *
 *     If neither of these conditions hold (no regions needed to fill the
 *     underloaded servers, no regions leftover from overloaded servers),
 *     we are done and return. Otherwise we handle these cases below.
 *
 * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers),
 *     we iterate the most loaded servers again, shedding a single region from
 *     each (this brings them from having <b>MAX</b> regions to having
 *     <b>MIN</b> regions).
 *
 * <li>We now definitely have more regions that need assignment, either from
 *     the previous step or from the original shedding from overloaded servers.
 *     Iterate the least loaded servers filling each to <b>MIN</b>.
 *
 * <li>If we still have more regions that need assignment, again iterate the
 *     least loaded servers, this time giving each one (filling them to
 *     <b>MAX</b>) until we run out.
 *
 * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions.
 *
 *     In addition, any server hosting >= <b>MAX</b> regions is guaranteed
 *     to end up with <b>MAX</b> regions at the end of the balancing. This
 *     ensures the minimal number of regions possible are moved.
 * </ol>
 *
 * TODO: We can at-most reassign the number of regions away from a particular
 *       server to be how many they report as most loaded.
 *       Should we just keep all assignment in memory? Any objections?
 *       Does this mean we need HeapSize on HMaster? Or just careful monitor?
 *       (current thinking is we will hold all assignments in memory)
 *
 * @param clusterState Map of regionservers and their load/region information to
 *                     a list of their most loaded regions
 * @return a list of regions to be moved, including source and destination,
 *         or null if cluster is already balanced
 */
public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterState) {
    boolean emptyRegionServerPresent = false;
    long startTime = System.currentTimeMillis();

    int numServers = clusterState.size();
    if (numServers == 0) {
        LOG.debug("numServers=0 so skipping load balancing");
        return null;
    }
    NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad =
            new TreeMap<ServerAndLoad, List<HRegionInfo>>();
    int numRegions = 0;
    // Iterate so we can count regions as we build the map
    for (Map.Entry<ServerName, List<HRegionInfo>> server : clusterState.entrySet()) {
        List<HRegionInfo> regions = server.getValue();
        int sz = regions.size();
        if (sz == 0)
            emptyRegionServerPresent = true;
        numRegions += sz;
        serversByLoad.put(new ServerAndLoad(server.getKey(), sz), regions);
    }

    // Check if we even need to do any load balancing
    float average = (float) numRegions / numServers; // for logging
    // HBASE-3681 check sloppiness first
    int floor = (int) Math.floor(average * (1 - slop));
    int ceiling = (int) Math.ceil(average * (1 + slop));
    if (serversByLoad.lastKey().getLoad() <= ceiling && serversByLoad.firstKey().getLoad() >= floor) {
        // Skipped because no server outside (min,max) range
        LOG.info("Skipping load balancing because balanced cluster; " + "servers=" + numServers + " "
                + "regions=" + numRegions + " average=" + average + " " + "mostloaded="
                + serversByLoad.lastKey().getLoad() + " leastloaded=" + serversByLoad.firstKey().getLoad());
        return null;
    }
    int min = numRegions / numServers;
    int max = numRegions % numServers == 0 ? min : min + 1;

    // Using to check balance result.
    StringBuilder strBalanceParam = new StringBuilder();
    strBalanceParam.append("Balance parameter: numRegions=").append(numRegions).append(", numServers=")
            .append(numServers).append(", max=").append(max).append(", min=").append(min);
    LOG.debug(strBalanceParam.toString());

    // Balance the cluster
    // TODO: Look at data block locality or a more complex load to do this
    MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create();
    List<RegionPlan> regionsToReturn = new ArrayList<RegionPlan>();

    // Walk down most loaded, pruning each to the max
    int serversOverloaded = 0;
    // flag used to fetch regions from head and tail of list, alternately
    boolean fetchFromTail = false;
    Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>();
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
        ServerAndLoad sal = server.getKey();
        int regionCount = sal.getLoad();
        if (regionCount <= max) {
            serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0));
            break;
        }
        serversOverloaded++;
        List<HRegionInfo> regions = server.getValue();
        int numToOffload = Math.min(regionCount - max, regions.size());
        // account for the out-of-band regions which were assigned to this server
        // after some other region server crashed
        Collections.sort(regions, riComparator);
        int numTaken = 0;
        for (int i = 0; i <= numToOffload;) {
            HRegionInfo hri = regions.get(i); // fetch from head
            if (fetchFromTail) {
                hri = regions.get(regions.size() - 1 - i);
            }
            i++;
            // Don't rebalance meta regions.
            if (hri.isMetaRegion())
                continue;
            regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null));
            numTaken++;
            if (numTaken >= numToOffload)
                break;
            // fetch in alternate order if there is new region server
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
        serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken));
    }
    int totalNumMoved = regionsToMove.size();

    // Walk down least loaded, filling each to the min
    int neededRegions = 0; // number of regions needed to bring all up to min
    fetchFromTail = false;

    Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>();
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        int regionCount = server.getKey().getLoad();
        if (regionCount >= min) {
            break;
        }
        underloadedServers.put(server.getKey().getServerName(), min - regionCount);
    }
    // number of servers that get new regions
    int serversUnderloaded = underloadedServers.size();
    int incr = 1;
    List<ServerName> sns = Arrays
            .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded]));
    Collections.shuffle(sns, RANDOM);
    while (regionsToMove.size() > 0) {
        int cnt = 0;
        int i = incr > 0 ? 0 : underloadedServers.size() - 1;
        for (; i >= 0 && i < underloadedServers.size(); i += incr) {
            if (regionsToMove.isEmpty())
                break;
            ServerName si = sns.get(i);
            int numToTake = underloadedServers.get(si);
            if (numToTake == 0)
                continue;

            addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }

            underloadedServers.put(si, numToTake - 1);
            cnt++;
            BalanceInfo bi = serverBalanceInfo.get(si);
            if (bi == null) {
                bi = new BalanceInfo(0, 0);
                serverBalanceInfo.put(si, bi);
            }
            bi.setNumRegionsAdded(bi.getNumRegionsAdded() + 1);
        }
        if (cnt == 0)
            break;
        // iterates underloadedServers in the other direction
        incr = -incr;
    }
    for (Integer i : underloadedServers.values()) {
        // If we still want to take some, increment needed
        neededRegions += i;
    }

    // If none needed to fill all to min and none left to drain all to max,
    // we are done
    if (neededRegions == 0 && regionsToMove.isEmpty()) {
        long endTime = System.currentTimeMillis();
        LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
                + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
                + " less loaded servers");
        return regionsToReturn;
    }

    // Need to do a second pass.
    // Either more regions to assign out or servers that are still underloaded

    // If we need more to fill min, grab one from each most loaded until enough
    if (neededRegions != 0) {
        // Walk down most loaded, grabbing one from each until we get enough
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
            BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
            int idx = balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload();
            if (idx >= server.getValue().size())
                break;
            HRegionInfo region = server.getValue().get(idx);
            if (region.isMetaRegion())
                continue; // Don't move meta regions.
            regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null));
            totalNumMoved++;
            if (--neededRegions == 0) {
                // No more regions needed, done shedding
                break;
            }
        }
    }

    // Now we have a set of regions that must be all assigned out
    // Assign each underloaded up to the min, then if leftovers, assign to max

    // Walk down least loaded, assigning to each to fill up to min
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        int regionCount = server.getKey().getLoad();
        if (regionCount >= min)
            break;
        BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
        if (balanceInfo != null) {
            regionCount += balanceInfo.getNumRegionsAdded();
        }
        if (regionCount >= min) {
            continue;
        }
        int numToTake = min - regionCount;
        int numTaken = 0;
        while (numTaken < numToTake && 0 < regionsToMove.size()) {
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            numTaken++;
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
    }

    // If we still have regions to dish out, assign underloaded to max
    if (0 < regionsToMove.size()) {
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
            int regionCount = server.getKey().getLoad();
            if (regionCount >= max) {
                break;
            }
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
            if (regionsToMove.isEmpty()) {
                break;
            }
        }
    }

    long endTime = System.currentTimeMillis();

    if (!regionsToMove.isEmpty() || neededRegions != 0) {
        // Emit data so can diagnose how balancer went astray.
        LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded="
                + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<ServerName, List<HRegionInfo>> e : clusterState.entrySet()) {
            if (sb.length() > 0)
                sb.append(", ");
            sb.append(e.getKey().toString());
            sb.append(" ");
            sb.append(e.getValue().size());
        }
        LOG.warn("Input " + sb.toString());
    }

    // All done!
    LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
            + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
            + " less loaded servers");

    return regionsToReturn;
}
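The balancer above leans on MinMaxPriorityQueue being a double-ended priority queue: addRegionPlan (an HBase helper not shown in this listing) can take a plan from either the head or the tail of regionsToMove depending on fetchFromTail. The snippet below is a minimal, self-contained sketch of that double-ended consumption pattern; the timestamps, natural ordering, and alternation flag are illustrative stand-ins, not HBase code.

import com.google.common.collect.MinMaxPriorityQueue;
import com.google.common.collect.Ordering;

public class DoubleEndedPollExample {
    public static void main(String[] args) {
        // Order "plans" by creation time: smallest timestamp (oldest) at the head,
        // largest timestamp (newest) at the tail.
        MinMaxPriorityQueue<Long> plans = MinMaxPriorityQueue.orderedBy(Ordering.<Long>natural()).create();
        for (long ts = 1; ts <= 6; ts++) {
            plans.add(ts);
        }

        // Alternate between the two ends, the way the balancer alternates
        // head/tail fetches when an empty region server is present.
        boolean fetchFromTail = false;
        while (!plans.isEmpty()) {
            Long next = fetchFromTail ? plans.pollLast() : plans.pollFirst();
            System.out.println((fetchFromTail ? "tail -> " : "head -> ") + next);
            fetchFromTail = !fetchFromTail;
        }
    }
}

pollFirst() removes the least element under the supplied comparator and pollLast() the greatest, so the head/tail alternation needs no second data structure.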
From source file:gobblin.source.extractor.extract.kafka.workunit.packer.KafkaWorkUnitPacker.java
/**
 * Pack a list of {@link WorkUnit}s into a smaller number of {@link MultiWorkUnit}s,
 * using the worst-fit-decreasing algorithm.
 *
 * Each {@link WorkUnit} is assigned to the {@link MultiWorkUnit} with the smallest load.
 */
protected List<WorkUnit> worstFitDecreasingBinPacking(List<WorkUnit> groups, int numOfMultiWorkUnits) {

    // Sort workunit groups by data size desc
    Collections.sort(groups, LOAD_DESC_COMPARATOR);

    MinMaxPriorityQueue<MultiWorkUnit> pQueue = MinMaxPriorityQueue.orderedBy(LOAD_ASC_COMPARATOR)
            .expectedSize(numOfMultiWorkUnits).create();
    for (int i = 0; i < numOfMultiWorkUnits; i++) {
        MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
        setWorkUnitEstSize(multiWorkUnit, 0);
        pQueue.add(multiWorkUnit);
    }

    for (WorkUnit group : groups) {
        MultiWorkUnit lightestMultiWorkUnit = pQueue.poll();
        addWorkUnitToMultiWorkUnit(group, lightestMultiWorkUnit);
        pQueue.add(lightestMultiWorkUnit);
    }

    logMultiWorkUnitInfo(pQueue);

    double minLoad = getWorkUnitEstLoad(pQueue.peekFirst());
    double maxLoad = getWorkUnitEstLoad(pQueue.peekLast());
    LOG.info(String.format("Min load of multiWorkUnit = %f; Max load of multiWorkUnit = %f; Diff = %f%%",
            minLoad, maxLoad, (maxLoad - minLoad) / maxLoad * 100.0));

    this.state.setProp(MIN_MULTIWORKUNIT_LOAD, minLoad);
    this.state.setProp(MAX_MULTIWORKUNIT_LOAD, maxLoad);

    List<WorkUnit> multiWorkUnits = Lists.newArrayList();
    multiWorkUnits.addAll(pQueue);
    return multiWorkUnits;
}
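The queue carries the whole worst-fit-decreasing strategy here: keep the bins ordered by current load, poll the lightest bin, add the next (largest remaining) item, and re-insert the bin. Below is a small sketch of that strategy on plain numbers; the Bin class, the item sizes, and numBins are made-up stand-ins for Gobblin's MultiWorkUnit, work-unit loads, and numOfMultiWorkUnits, and the helpers used above (setWorkUnitEstSize, addWorkUnitToMultiWorkUnit, getWorkUnitEstLoad) are not reproduced.

import com.google.common.collect.MinMaxPriorityQueue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class WorstFitDecreasingExample {
    // Hypothetical stand-in for a MultiWorkUnit: just tracks a total load.
    static final class Bin {
        double load;
        final List<Double> items = new ArrayList<>();
    }

    public static void main(String[] args) {
        List<Double> sizes = new ArrayList<>(Arrays.asList(7.0, 5.0, 4.0, 3.0, 2.0, 1.0));
        sizes.sort(Comparator.reverseOrder()); // the "decreasing" part of worst-fit-decreasing

        int numBins = 3;
        MinMaxPriorityQueue<Bin> bins = MinMaxPriorityQueue
                .orderedBy(Comparator.comparingDouble((Bin b) -> b.load))
                .expectedSize(numBins)
                .create();
        for (int i = 0; i < numBins; i++) {
            bins.add(new Bin());
        }

        for (double size : sizes) {
            Bin lightest = bins.poll(); // head of the queue = currently lightest bin
            lightest.items.add(size);
            lightest.load += size;
            bins.add(lightest);         // re-insert so the ordering stays correct
        }

        System.out.println("min load = " + bins.peekFirst().load
                + ", max load = " + bins.peekLast().load);
    }
}

peekFirst() and peekLast() return the lightest and heaviest bins without removing them, which is how the method above reports the min/max load spread after packing.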