Java tutorial: the Apache Helix AutoRebalanceStrategy

The complete source of org.apache.helix.controller.strategy.AutoRebalanceStrategy is listed below. The strategy computes an assignment of partition replicas to nodes that spreads replicas as evenly as possible, keeps replicas of the same partition on distinct nodes, and prefers to leave existing replicas where they already are. Two short usage sketches follow the listing.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.helix.controller.strategy;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.helix.HelixManager;
import org.apache.helix.ZNRecord;
import org.apache.helix.api.State;
import org.apache.helix.api.id.ParticipantId;
import org.apache.helix.api.id.PartitionId;
import org.apache.helix.api.id.ResourceId;
import org.apache.helix.model.ResourceAssignment;
import org.apache.log4j.Logger;

import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.collect.Lists;

public class AutoRebalanceStrategy {

  private static Logger logger = Logger.getLogger(AutoRebalanceStrategy.class);

  private final ResourceId _resourceId;
  private final List<PartitionId> _partitions;
  private final LinkedHashMap<State, Integer> _states;
  private final int _maximumPerNode;
  private final ReplicaPlacementScheme _placementScheme;

  private Map<ParticipantId, Node> _nodeMap;
  private List<Node> _liveNodesList;
  private Map<Integer, State> _stateMap;

  private Map<Replica, Node> _preferredAssignment;
  private Map<Replica, Node> _existingPreferredAssignment;
  private Map<Replica, Node> _existingNonPreferredAssignment;
  private Set<Replica> _orphaned;

  /**
   * Initialize this strategy for a resource
   * @param resourceName the resource for which an assignment will be computed
   * @param partitions the partition names for the resource
   * @param states the states and the number of replicas that should be in each state
   * @param maximumPerNode the maximum number of replicas any node can hold
   * @param placementScheme the scheme to use for preferred replica locations.
   *          If null, defaults to {@link DefaultPlacementScheme}
   */
  public AutoRebalanceStrategy(String resourceName, final List<String> partitions,
      final LinkedHashMap<String, Integer> states, int maximumPerNode,
      ReplicaPlacementScheme placementScheme) {
    _resourceId = ResourceId.from(resourceName);
    _partitions = Lists.newArrayList(Lists.transform(partitions,
        new Function<String, PartitionId>() {
          @Override
          public PartitionId apply(String input) {
            return PartitionId.from(input);
          }
        }));
    _states = new LinkedHashMap<State, Integer>();
    for (String state : states.keySet()) {
      _states.put(State.from(state), states.get(state));
    }
    _maximumPerNode = maximumPerNode;
    if (placementScheme != null) {
      _placementScheme = placementScheme;
    } else {
      _placementScheme = new DefaultPlacementScheme();
    }
  }

  /**
   * Initialize the strategy with a default placement scheme
   * @see #AutoRebalanceStrategy(String, List, LinkedHashMap, int, ReplicaPlacementScheme)
   */
  public AutoRebalanceStrategy(String resourceName, final List<String> partitions,
      final LinkedHashMap<String, Integer> states) {
    this(resourceName, partitions, states, Integer.MAX_VALUE, new DefaultPlacementScheme());
  }

  /**
   * Constructor to support logically-typed Helix components
   * @param resourceId the resource for which to compute an assignment
   * @param partitions the partitions of the resource
   * @param states the states and counts for each state
   * @param maximumPerNode the maximum number of replicas per node
   * @param placementScheme the scheme to use for preferred replica locations. If null, defaults
   *          to {@link DefaultPlacementScheme}
   */
  public AutoRebalanceStrategy(ResourceId resourceId, final List<PartitionId> partitions,
      final LinkedHashMap<State, Integer> states, int maximumPerNode,
      ReplicaPlacementScheme placementScheme) {
    _resourceId = resourceId;
    _partitions = partitions;
    _states = states;
    _maximumPerNode = maximumPerNode;
    if (placementScheme != null) {
      _placementScheme = placementScheme;
    } else {
      _placementScheme = new DefaultPlacementScheme();
    }
  }

  /**
   * Determine a preference list and mapping of partitions to nodes for all replicas, using
   * logically-typed inputs
   * @param liveNodes list of live participant ids
   * @param currentMapping map of partition id to map of participant id to state
   * @param allNodes list of all participant ids
   * @return the preference list and replica mapping
   */
  public ZNRecord typedComputePartitionAssignment(final List<ParticipantId> liveNodes,
      final Map<PartitionId, Map<ParticipantId, State>> currentMapping,
      final List<ParticipantId> allNodes) {
    Comparator<ParticipantId> nodeComparator = new NodeComparator();
    Comparator<ParticipantId> currentStateNodeComparator =
        new CurrentStateNodeComparator(currentMapping);
    List<ParticipantId> sortedLiveNodes = new ArrayList<ParticipantId>(liveNodes);
    Collections.sort(sortedLiveNodes, currentStateNodeComparator);
    List<ParticipantId> sortedAllNodes = new ArrayList<ParticipantId>(allNodes);
    Collections.sort(sortedAllNodes, nodeComparator);
    List<String> sortedNodeNames =
        Lists.newArrayList(Lists.transform(sortedAllNodes, Functions.toStringFunction()));
    int numReplicas = countStateReplicas();
    ZNRecord znRecord = new ZNRecord(_resourceId.stringify());
    if (sortedLiveNodes.size() == 0) {
      return znRecord;
    }
    int distRemainder = (numReplicas * _partitions.size()) % sortedLiveNodes.size();
    int distFloor = (numReplicas * _partitions.size()) / sortedLiveNodes.size();
    _nodeMap = new HashMap<ParticipantId, Node>();
    _liveNodesList = new ArrayList<Node>();

    for (ParticipantId id : sortedAllNodes) {
      Node node = new Node(id);
      node.capacity = 0;
      node.hasCeilingCapacity = false;
      _nodeMap.put(id, node);
    }
    for (int i = 0; i < sortedLiveNodes.size(); i++) {
      boolean usingCeiling = false;
      int targetSize = (_maximumPerNode > 0) ? Math.min(distFloor, _maximumPerNode) : distFloor;
      if (distRemainder > 0 && targetSize < _maximumPerNode) {
        targetSize += 1;
        distRemainder = distRemainder - 1;
        usingCeiling = true;
      }
      Node node = _nodeMap.get(sortedLiveNodes.get(i));
      node.isAlive = true;
      node.capacity = targetSize;
      node.hasCeilingCapacity = usingCeiling;
      _liveNodesList.add(node);
    }

    // compute states for all replica ids
    _stateMap = generateStateMap();

    // compute the preferred mapping if all nodes were up
    _preferredAssignment = computePreferredPlacement(sortedNodeNames);
    // logger.info("preferred mapping:"+ preferredAssignment);

    // from the current mapping, derive the replicas already in their preferred location;
    // this updates the nodes with their current fill status
    _existingPreferredAssignment = computeExistingPreferredPlacement(currentMapping);

    // from the current mapping, derive the replicas not in their preferred location
    _existingNonPreferredAssignment = computeExistingNonPreferredPlacement(currentMapping);

    // compute orphaned replicas that are not assigned to any node
    _orphaned = computeOrphaned();
    if (logger.isInfoEnabled()) {
      logger.info("orphan = " + _orphaned);
    }

    moveNonPreferredReplicasToPreferred();
    assignOrphans();
    moveExcessReplicas();

    prepareResult(znRecord);
    return znRecord;
  }

  /**
   * Wrap {@link #typedComputePartitionAssignment(List, Map, List)} with a function that takes
   * concrete String types
   * @param liveNodes the current list of live participants
   * @param currentMapping the current assignment of replicas to nodes
   * @param allNodes the full list of known nodes in the system
   * @return the preference list and replica mapping
   */
  public ZNRecord computePartitionAssignment(final List<String> liveNodes,
      final Map<String, Map<String, String>> currentMapping, final List<String> allNodes) {
    Function<String, ParticipantId> participantConverter = new Function<String, ParticipantId>() {
      @Override
      public ParticipantId apply(String participantId) {
        return ParticipantId.from(participantId);
      }
    };
    List<ParticipantId> typedLiveNodes =
        Lists.newArrayList(Lists.transform(liveNodes, participantConverter));
    List<ParticipantId> typedAllNodes =
        Lists.newArrayList(Lists.transform(allNodes, participantConverter));
    Map<PartitionId, Map<ParticipantId, State>> typedCurrentMapping =
        ResourceAssignment.replicaMapsFromStringMaps(currentMapping);
    return typedComputePartitionAssignment(typedLiveNodes, typedCurrentMapping, typedAllNodes);
  }

  /**
   * Move replicas assigned to non-preferred nodes if their current node is over capacity
   * and their preferred node is under capacity.
   */
  private void moveNonPreferredReplicasToPreferred() {
    // iterate through the non-preferred replicas and see if each can be moved to its
    // preferred location: the donor must hold more than it should, and the receiver must
    // have enough capacity
    Iterator<Entry<Replica, Node>> iterator =
        _existingNonPreferredAssignment.entrySet().iterator();
    while (iterator.hasNext()) {
      Entry<Replica, Node> entry = iterator.next();
      Replica replica = entry.getKey();
      Node donor = entry.getValue();
      Node receiver = _preferredAssignment.get(replica);
      if (donor.capacity < donor.currentlyAssigned
          && receiver.capacity > receiver.currentlyAssigned && receiver.canAdd(replica)) {
        donor.currentlyAssigned = donor.currentlyAssigned - 1;
        receiver.currentlyAssigned = receiver.currentlyAssigned + 1;
        donor.nonPreferred.remove(replica);
        receiver.preferred.add(replica);
        donor.newReplicas.remove(replica);
        receiver.newReplicas.add(replica);
        iterator.remove();
      }
    }
  }

  /**
   * Slot in orphaned partitions randomly so as to maintain even load on live nodes.
   */
  private void assignOrphans() {
    // iterate over the remaining orphaned replicas and assign each to a live node,
    // starting from a pseudo-random index so the load stays even
    Iterator<Replica> it = _orphaned.iterator();
    while (it.hasNext()) {
      Replica replica = it.next();
      boolean added = false;
      int startIndex = computeRandomStartIndex(replica);
      for (int index = startIndex; index < startIndex + _liveNodesList.size(); index++) {
        Node receiver = _liveNodesList.get(index % _liveNodesList.size());
        if (receiver.capacity > receiver.currentlyAssigned && receiver.canAdd(replica)) {
          receiver.currentlyAssigned = receiver.currentlyAssigned + 1;
          receiver.nonPreferred.add(replica);
          receiver.newReplicas.add(replica);
          added = true;
          break;
        }
      }
      if (!added) {
        // try adding the replica by making room for it
        added = assignOrphanByMakingRoom(replica);
      }
      if (added) {
        it.remove();
      }
    }
    if (_orphaned.size() > 0 && logger.isInfoEnabled()) {
      logger.info("could not assign nodes to partitions: " + _orphaned);
    }
  }

  /**
   * If an orphan can't be assigned normally, see if a node can borrow capacity to accept it
   * @param replica The replica to assign
   * @return true if the assignment succeeded, false otherwise
   */
  private boolean assignOrphanByMakingRoom(Replica replica) {
    Node capacityDonor = null;
    Node capacityAcceptor = null;
    int startIndex = computeRandomStartIndex(replica);
    for (int index = startIndex; index < startIndex + _liveNodesList.size(); index++) {
      Node current = _liveNodesList.get(index % _liveNodesList.size());
      if (current.hasCeilingCapacity && current.capacity > current.currentlyAssigned
          && !current.canAddIfCapacity(replica) && capacityDonor == null) {
        // this node has spare capacity but cannot accept the replica
        capacityDonor = current;
      } else if (!current.hasCeilingCapacity && current.capacity == current.currentlyAssigned
          && current.canAddIfCapacity(replica) && capacityAcceptor == null) {
        // this node could accept the replica if it were granted ceiling capacity
        capacityAcceptor = current;
      }
      if (capacityDonor != null && capacityAcceptor != null) {
        break;
      }
    }
    if (capacityDonor != null && capacityAcceptor != null) {
      // transfer ceiling capacity and add the replica
      capacityAcceptor.steal(capacityDonor, replica);
      return true;
    }
    return false;
  }

  /**
   * Move replicas from too-full nodes to nodes that can accept the replicas
   */
  private void moveExcessReplicas() {
    // iterate over nodes and move extra load
    Iterator<Replica> it;
    for (Node donor : _liveNodesList) {
      if (donor.capacity < donor.currentlyAssigned) {
        Collections.sort(donor.nonPreferred);
        it = donor.nonPreferred.iterator();
        while (it.hasNext()) {
          Replica replica = it.next();
          int startIndex = computeRandomStartIndex(replica);
          for (int index = startIndex; index < startIndex + _liveNodesList.size(); index++) {
            Node receiver = _liveNodesList.get(index % _liveNodesList.size());
            if (receiver.canAdd(replica)) {
              receiver.currentlyAssigned = receiver.currentlyAssigned + 1;
              receiver.nonPreferred.add(replica);
              donor.currentlyAssigned = donor.currentlyAssigned - 1;
              it.remove();
              break;
            }
          }
          if (donor.capacity >= donor.currentlyAssigned) {
            break;
          }
        }
        if (donor.capacity < donor.currentlyAssigned) {
          logger.warn("Could not take partitions out of node:" + donor.id);
        }
      }
    }
  }

  /**
   * Update a ZNRecord with the results of the rebalancing.
   * @param znRecord the record to update
   */
  private void prepareResult(ZNRecord znRecord) {
    // The map fields are keyed on partition name to a pair of node and state, i.e. they
    // indicate that the partition with the given state is served by that node.
    //
    // The list fields are also keyed on partition and list all the nodes serving that partition.
    // This is useful to verify that no node serves multiple replicas of the same partition.
    Map<String, List<String>> newPreferences = new TreeMap<String, List<String>>();
    for (PartitionId partition : _partitions) {
      String partitionName = partition.stringify();
      znRecord.setMapField(partitionName, new TreeMap<String, String>());
      znRecord.setListField(partitionName, new ArrayList<String>());
      newPreferences.put(partitionName, new ArrayList<String>());
    }

    // for preference lists, the rough priority that we want is:
    // [existing preferred, existing non-preferred, non-existing preferred,
    // non-existing non-preferred]
    for (Node node : _liveNodesList) {
      for (Replica replica : node.preferred) {
        if (node.newReplicas.contains(replica)) {
          newPreferences.get(replica.partition.toString()).add(node.id.toString());
        } else {
          znRecord.getListField(replica.partition.toString()).add(node.id.toString());
        }
      }
    }
    for (Node node : _liveNodesList) {
      for (Replica replica : node.nonPreferred) {
        if (node.newReplicas.contains(replica)) {
          newPreferences.get(replica.partition.toString()).add(node.id.toString());
        } else {
          znRecord.getListField(replica.partition.toString()).add(node.id.toString());
        }
      }
    }
    normalizePreferenceLists(znRecord.getListFields(), newPreferences);

    // generate preference maps based on the preference lists
    for (PartitionId partition : _partitions) {
      List<String> preferenceList = znRecord.getListField(partition.toString());
      int i = 0;
      for (String participant : preferenceList) {
        znRecord.getMapField(partition.toString()).put(participant, _stateMap.get(i).toString());
        i++;
      }
    }
  }

  /**
   * Adjust preference lists to reduce the number of same replicas on an instance. This will
   * separately normalize two sets of preference lists, and then append the results of the second
   * set to those of the first. This ensures that existing replicas are automatically preferred.
   * @param preferenceLists map of (partition --> list of nodes)
   * @param newPreferences map containing node preferences not consistent with the current
   *          assignment
   */
  private void normalizePreferenceLists(Map<String, List<String>> preferenceLists,
      Map<String, List<String>> newPreferences) {
    Map<String, Map<String, Integer>> nodeReplicaCounts =
        new HashMap<String, Map<String, Integer>>();
    for (String partition : preferenceLists.keySet()) {
      normalizePreferenceList(preferenceLists.get(partition), nodeReplicaCounts);
    }
    for (String partition : newPreferences.keySet()) {
      normalizePreferenceList(newPreferences.get(partition), nodeReplicaCounts);
      preferenceLists.get(partition).addAll(newPreferences.get(partition));
    }
  }

  /**
   * Adjust a single preference list for replica assignment imbalance
   * @param preferenceList list of node names
   * @param nodeReplicaCounts map of (node --> state --> count)
   */
  private void normalizePreferenceList(List<String> preferenceList,
      Map<String, Map<String, Integer>> nodeReplicaCounts) {
    // a LinkedHashSet preserves iteration order
    Set<String> notAssigned = new LinkedHashSet<String>(preferenceList);
    List<String> newPreferenceList = new ArrayList<String>();
    int replicas = Math.min(countStateReplicas(), preferenceList.size());
    for (int i = 0; i < replicas; i++) {
      State state = _stateMap.get(i);
      String node = getMinimumNodeForReplica(state, notAssigned, nodeReplicaCounts);
      newPreferenceList.add(node);
      notAssigned.remove(node);
      Map<String, Integer> counts = nodeReplicaCounts.get(node);
      counts.put(state.toString(), counts.get(state.toString()) + 1);
    }
    preferenceList.clear();
    preferenceList.addAll(newPreferenceList);
  }

  /**
   * Get the node that hosts the fewest replicas in a given state
   * @param state the state
   * @param nodes nodes to check
   * @param nodeReplicaCounts current assignment of replicas
   * @return the node most willing to accept the replica
   */
  private String getMinimumNodeForReplica(State state, Set<String> nodes,
      Map<String, Map<String, Integer>> nodeReplicaCounts) {
    String minimalNode = null;
    int minimalCount = Integer.MAX_VALUE;
    for (String node : nodes) {
      int count = getReplicaCountForNode(state, node, nodeReplicaCounts);
      if (count < minimalCount) {
        minimalCount = count;
        minimalNode = node;
      }
    }
    return minimalNode;
  }

  /**
   * Safely check the number of replicas in a given state assigned to a node
   * @param state the state to check
   * @param node the node to check
   * @param nodeReplicaCounts a map of node to state to count
   * @return the number of currently assigned replicas in the given state
   */
  private int getReplicaCountForNode(State state, String node,
      Map<String, Map<String, Integer>> nodeReplicaCounts) {
    if (!nodeReplicaCounts.containsKey(node)) {
      Map<String, Integer> replicaCounts = new HashMap<String, Integer>();
      replicaCounts.put(state.toString(), 0);
      nodeReplicaCounts.put(node, replicaCounts);
      return 0;
    }
    Map<String, Integer> replicaCounts = nodeReplicaCounts.get(node);
    if (!replicaCounts.containsKey(state.toString())) {
      replicaCounts.put(state.toString(), 0);
      return 0;
    }
    return replicaCounts.get(state.toString());
  }

  /**
   * Compute the subset of the current mapping where replicas are not mapped according to their
   * preferred assignment.
   * @param currentMapping Current mapping of replicas to nodes
   * @return The current assignments that do not conform to the preferred assignment
   */
  private Map<Replica, Node> computeExistingNonPreferredPlacement(
      Map<PartitionId, Map<ParticipantId, State>> currentMapping) {
    Map<Replica, Node> existingNonPreferredAssignment = new TreeMap<Replica, Node>();
    int count = countStateReplicas();
    for (PartitionId partition : currentMapping.keySet()) {
      Map<ParticipantId, State> nodeStateMap = currentMapping.get(partition);
      nodeStateMap.keySet().retainAll(_nodeMap.keySet());
      for (ParticipantId nodeId : nodeStateMap.keySet()) {
        Node node = _nodeMap.get(nodeId);
        boolean skip = false;
        for (Replica replica : node.preferred) {
          if (replica.partition.equals(partition)) {
            skip = true;
            break;
          }
        }
        if (skip) {
          continue;
        }
        // check if the replica is in one of the preferred positions
        for (int replicaId = 0; replicaId < count; replicaId++) {
          Replica replica = new Replica(partition, replicaId);
          if (!_preferredAssignment.get(replica).id.equals(node.id)
              && !_existingPreferredAssignment.containsKey(replica)
              && !existingNonPreferredAssignment.containsKey(replica)) {
            existingNonPreferredAssignment.put(replica, node);
            node.nonPreferred.add(replica);
            break;
          }
        }
      }
    }
    return existingNonPreferredAssignment;
  }

  /**
   * Get a live node index to try first for a replica so that each possible start index is
   * roughly uniformly assigned.
   * @param replica The replica to assign
   * @return The starting node index to try
   */
  private int computeRandomStartIndex(final Replica replica) {
    return (replica.hashCode() & 0x7FFFFFFF) % _liveNodesList.size();
  }

  /**
   * Get a set of replicas not currently assigned to any node
   * @return Unassigned replicas
   */
  private Set<Replica> computeOrphaned() {
    Set<Replica> orphanedPartitions = new TreeSet<Replica>(_preferredAssignment.keySet());
    for (Replica r : _existingPreferredAssignment.keySet()) {
      if (orphanedPartitions.contains(r)) {
        orphanedPartitions.remove(r);
      }
    }
    for (Replica r : _existingNonPreferredAssignment.keySet()) {
      if (orphanedPartitions.contains(r)) {
        orphanedPartitions.remove(r);
      }
    }
    return orphanedPartitions;
  }

  /**
   * Determine the replicas already assigned to their preferred nodes
   * @param currentMapping Current assignment of replicas to nodes
   * @return Assignments that conform to the preferred placement
   */
  private Map<Replica, Node> computeExistingPreferredPlacement(
      final Map<PartitionId, Map<ParticipantId, State>> currentMapping) {
    Map<Replica, Node> existingPreferredAssignment = new TreeMap<Replica, Node>();
    int count = countStateReplicas();
    for (PartitionId partition : currentMapping.keySet()) {
      Map<ParticipantId, State> nodeStateMap = currentMapping.get(partition);
      nodeStateMap.keySet().retainAll(_nodeMap.keySet());
      for (ParticipantId nodeId : nodeStateMap.keySet()) {
        Node node = _nodeMap.get(nodeId);
        node.currentlyAssigned = node.currentlyAssigned + 1;
        // check if the replica is in one of the preferred positions
        for (int replicaId = 0; replicaId < count; replicaId++) {
          Replica replica = new Replica(partition, replicaId);
          if (_preferredAssignment.containsKey(replica)
              && !existingPreferredAssignment.containsKey(replica)
              && _preferredAssignment.get(replica).id.equals(node.id)) {
            existingPreferredAssignment.put(replica, node);
            node.preferred.add(replica);
            break;
          }
        }
      }
    }
    return existingPreferredAssignment;
  }

  /**
   * Given a predefined set of all possible nodes, compute an assignment of replicas to
   * nodes that evenly assigns all replicas to nodes.
   * @param nodeNames Identifiers of all nodes, live and non-live
   * @return Preferred assignment of replicas
   */
  private Map<Replica, Node> computePreferredPlacement(final List<String> nodeNames) {
    Map<Replica, Node> preferredMapping = new HashMap<Replica, Node>();
    int partitionId = 0;
    int numReplicas = countStateReplicas();
    for (PartitionId partition : _partitions) {
      for (int replicaId = 0; replicaId < numReplicas; replicaId++) {
        Replica replica = new Replica(partition, replicaId);
        ParticipantId nodeName =
            ParticipantId.from(_placementScheme.getLocation(partitionId, replicaId,
                _partitions.size(), numReplicas, nodeNames));
        preferredMapping.put(replica, _nodeMap.get(nodeName));
      }
      partitionId = partitionId + 1;
    }
    return preferredMapping;
  }

  /**
   * Count the total number of replicas given a state-count mapping
   * @return The total number of replicas
   */
  private int countStateReplicas() {
    int total = 0;
    for (Integer count : _states.values()) {
      total += count;
    }
    return total;
  }

  /**
   * Compute a map of replica ids to state names
   * @return Map: replica id -> state name
   */
  private Map<Integer, State> generateStateMap() {
    int replicaId = 0;
    Map<Integer, State> stateMap = new HashMap<Integer, State>();
    for (State state : _states.keySet()) {
      Integer count = _states.get(state);
      for (int i = 0; i < count; i++) {
        stateMap.put(replicaId, state);
        replicaId++;
      }
    }
    return stateMap;
  }

  /**
   * A Node is an entity that can serve replicas. It has a capacity and knowledge
   * of replicas assigned to it, so it can decide if it can receive additional replicas.
   */
  class Node {
    public int currentlyAssigned;
    public int capacity;
    public boolean hasCeilingCapacity;
    private ParticipantId id;
    boolean isAlive;
    private List<Replica> preferred;
    private List<Replica> nonPreferred;
    private Set<Replica> newReplicas;

    public Node(ParticipantId id) {
      preferred = new ArrayList<Replica>();
      nonPreferred = new ArrayList<Replica>();
      newReplicas = new TreeSet<Replica>();
      currentlyAssigned = 0;
      isAlive = false;
      this.id = id;
    }

    /**
     * Check if this replica can be legally added to this node
     * @param replica The replica to test
     * @return true if the assignment can be made, false otherwise
     */
    public boolean canAdd(Replica replica) {
      if (currentlyAssigned >= capacity) {
        return false;
      }
      return canAddIfCapacity(replica);
    }

    /**
     * Check if this replica can be legally added to this node, provided that it has enough
     * capacity.
     * @param replica The replica to test
     * @return true if the assignment can be made, false otherwise
     */
    public boolean canAddIfCapacity(Replica replica) {
      if (!isAlive) {
        return false;
      }
      for (Replica r : preferred) {
        if (r.partition.equals(replica.partition)) {
          return false;
        }
      }
      for (Replica r : nonPreferred) {
        if (r.partition.equals(replica.partition)) {
          return false;
        }
      }
      return true;
    }

    /**
     * Receive a replica by stealing capacity from another Node
     * @param donor The node that has excess capacity
     * @param replica The replica to receive
     */
    public void steal(Node donor, Replica replica) {
      donor.hasCeilingCapacity = false;
      donor.capacity--;
      hasCeilingCapacity = true;
      capacity++;
      currentlyAssigned++;
      nonPreferred.add(replica);
      newReplicas.add(replica);
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("##########\nname=").append(id.toString()).append("\npreferred:")
          .append(preferred.size()).append("\nnonpreferred:").append(nonPreferred.size());
      return sb.toString();
    }
  }

  /**
   * A Replica is a combination of a partition of the resource, the state the replica is in,
   * and an identifier signifying a specific replica of a given partition and state.
   */
  class Replica implements Comparable<Replica> {
    private PartitionId partition;
    private int replicaId; // this is a partition-relative id
    private String format;

    public Replica(PartitionId partition, int replicaId) {
      this.partition = partition;
      this.replicaId = replicaId;
      this.format = this.partition.toString() + "|" + this.replicaId;
    }

    @Override
    public String toString() {
      return format;
    }

    @Override
    public boolean equals(Object that) {
      if (that instanceof Replica) {
        return this.format.equals(((Replica) that).format);
      }
      return false;
    }

    @Override
    public int hashCode() {
      return this.format.hashCode();
    }

    @Override
    public int compareTo(Replica that) {
      if (that instanceof Replica) {
        return this.format.compareTo(that.format);
      }
      return -1;
    }
  }

  /**
   * Interface for providing a custom approach to computing a replica's affinity to a node.
   */
  public interface ReplicaPlacementScheme {
    /**
     * Initialize global state
     * @param manager The instance to which this placement is associated
     */
    public void init(final HelixManager manager);

    /**
     * Given properties of this replica, determine the node it would prefer to be served by
     * @param partitionId The current partition
     * @param replicaId The current replica with respect to the current partition
     * @param numPartitions The total number of partitions
     * @param numReplicas The total number of replicas per partition
     * @param nodeNames A list of identifiers of all nodes, live and non-live
     * @return The name of the node that would prefer to serve this replica
     */
    public String getLocation(int partitionId, int replicaId, int numPartitions, int numReplicas,
        final List<String> nodeNames);
  }

  /**
   * Compute preferred placements based on a default strategy that assigns replicas to nodes as
   * evenly as possible while avoiding placing two replicas of the same partition on any node.
   */
  public static class DefaultPlacementScheme implements ReplicaPlacementScheme {
    @Override
    public void init(final HelixManager manager) {
      // do nothing, since this placement scheme is independent of the manager
    }

    @Override
    public String getLocation(int partitionId, int replicaId, int numPartitions, int numReplicas,
        final List<String> nodeNames) {
      int index;
      if (nodeNames.size() > numPartitions) {
        // assign replicas in partition order in case there are more nodes than partitions
        index = (partitionId + replicaId * numPartitions) % nodeNames.size();
      } else if (nodeNames.size() == numPartitions) {
        // need a replica offset in case the sizes of these sets are the same
        index =
            ((partitionId + replicaId * numPartitions) % nodeNames.size() + replicaId)
                % nodeNames.size();
      } else {
        // in all other cases, assigning one replica at a time for each partition is reasonable
        index = (partitionId + replicaId) % nodeNames.size();
      }
      return nodeNames.get(index);
    }
  }

  private static class NodeComparator implements Comparator<ParticipantId> {
    @Override
    public int compare(ParticipantId o1, ParticipantId o2) {
      return o1.toString().compareTo(o2.toString());
    }
  }

  /**
   * Sorter for live nodes that sorts first by the number of partitions currently registered
   * against a node (more partitions sort earlier), then by node name. Visiting heavily-loaded
   * nodes first prevents the capacity assignment from unnecessarily reducing the capacity of
   * nodes later in the list, which would force partitions to move.
   */
  private static class CurrentStateNodeComparator implements Comparator<ParticipantId> {
    /**
     * The number of partitions that are active for each participant.
     */
    private final Map<ParticipantId, Integer> partitionCounts;

    /**
     * Create a comparator from the current mapping.
     * @param currentMapping The current mapping of partitions to participants.
     */
    public CurrentStateNodeComparator(Map<PartitionId, Map<ParticipantId, State>> currentMapping) {
      partitionCounts = new HashMap<ParticipantId, Integer>();
      for (Entry<PartitionId, Map<ParticipantId, State>> entry : currentMapping.entrySet()) {
        for (ParticipantId participantId : entry.getValue().keySet()) {
          Integer existing = partitionCounts.get(participantId);
          partitionCounts.put(participantId, existing != null ? existing + 1 : 1);
        }
      }
    }

    @Override
    public int compare(ParticipantId o1, ParticipantId o2) {
      Integer c1 = partitionCounts.get(o1);
      if (c1 == null) {
        c1 = 0;
      }
      Integer c2 = partitionCounts.get(o2);
      if (c2 == null) {
        c2 = 0;
      }
      return c1 < c2 ? 1 : (c1 > c2 ? -1 : o1.toString().compareTo(o2.toString()));
    }
  }
}
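
The strategy is normally driven by the Helix controller, but it can also be exercised directly. The sketch below is a minimal, hypothetical usage example: the resource name myDB, the partition names, and the node names are all made up for illustration. It requests one MASTER and two SLAVE replicas per partition across three live nodes, starting from an empty current mapping.

import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.helix.ZNRecord;
import org.apache.helix.controller.strategy.AutoRebalanceStrategy;

public class AutoRebalanceStrategyExample {
  public static void main(String[] args) {
    // Hypothetical resource: 4 partitions, each with 1 MASTER and 2 SLAVE replicas.
    List<String> partitions = Arrays.asList("myDB_0", "myDB_1", "myDB_2", "myDB_3");
    LinkedHashMap<String, Integer> states = new LinkedHashMap<String, Integer>();
    states.put("MASTER", 1); // replica id 0 -> MASTER
    states.put("SLAVE", 2);  // replica ids 1..2 -> SLAVE

    List<String> allNodes = Arrays.asList("node_0", "node_1", "node_2");
    List<String> liveNodes = Arrays.asList("node_0", "node_1", "node_2");

    // Nothing is placed yet, so the current mapping is empty.
    Map<String, Map<String, String>> currentMapping =
        new HashMap<String, Map<String, String>>();

    AutoRebalanceStrategy strategy = new AutoRebalanceStrategy("myDB", partitions, states);
    ZNRecord assignment =
        strategy.computePartitionAssignment(liveNodes, currentMapping, allNodes);

    // The list field of each partition is its preference list; since the states map is
    // insertion-ordered, the first entry in each list is the intended MASTER.
    for (String partition : partitions) {
      System.out.println(partition + " -> " + assignment.getListField(partition));
    }
  }
}

If this is re-run later with the previous assignment passed in as currentMapping (and perhaps a shorter liveNodes list after a failure), the existing-preferred and existing-non-preferred bookkeeping above keeps replicas where they already are whenever capacity allows, so only the minimum necessary replicas move.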
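
The placement arithmetic in DefaultPlacementScheme.getLocation can also be checked by hand. A small sketch, again with hypothetical node names, for 2 partitions with 2 replicas each on 3 nodes, which exercises the "more nodes than partitions" branch:

import java.util.Arrays;
import java.util.List;

import org.apache.helix.controller.strategy.AutoRebalanceStrategy;

public class PlacementSchemeExample {
  public static void main(String[] args) {
    AutoRebalanceStrategy.DefaultPlacementScheme scheme =
        new AutoRebalanceStrategy.DefaultPlacementScheme();
    List<String> nodes = Arrays.asList("node_0", "node_1", "node_2");

    // 3 nodes > 2 partitions, so index = (partitionId + replicaId * numPartitions) % 3
    System.out.println(scheme.getLocation(0, 0, 2, 2, nodes)); // (0 + 0*2) % 3 = 0 -> node_0
    System.out.println(scheme.getLocation(0, 1, 2, 2, nodes)); // (0 + 1*2) % 3 = 2 -> node_2
    System.out.println(scheme.getLocation(1, 0, 2, 2, nodes)); // (1 + 0*2) % 3 = 1 -> node_1
    System.out.println(scheme.getLocation(1, 1, 2, 2, nodes)); // (1 + 1*2) % 3 = 0 -> node_0
  }
}

Note that the two replicas of each partition land on different nodes, which is exactly the property computePreferredPlacement relies on when it builds the preferred assignment.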