From 4e2ac0478f90ae53ee97f921eef14c524eb22eed Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 12 Sep 2019 07:43:47 +0200 Subject: [PATCH 01/10] Add function to rank buckets for a key The ranking is computed using the Rendez-vous hashing algorithm. --- binderhub/tests/test_utils.py | 25 +++++++++++++ binderhub/utils.py | 66 +++++++++++++++++++++++++++-------- 2 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 binderhub/tests/test_utils.py diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py new file mode 100644 index 000000000..3370b71b4 --- /dev/null +++ b/binderhub/tests/test_utils.py @@ -0,0 +1,25 @@ +from binderhub import utils + + +def test_rendezvous_rank(): + # check that a key doesn't move if its assigned bucket remains but the + # other buckets are removed + key = "k1" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + print(first_round) + + assert first_round[0] == second_round[0], key + + key = "sdsdggdddddd" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + assert first_round[0] == second_round[0], key + + key = "crazy frog is a crazy key" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + assert first_round[0] == second_round[0], key diff --git a/binderhub/utils.py b/binderhub/utils.py index 614fcaafc..38fd8380b 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -1,8 +1,37 @@ """Miscellaneous utilities""" from collections import OrderedDict +from hashlib import blake2b +from math import log + from traitlets import Integer, TraitError +def blake2b_as_float(b): + """Compute Blake2 digest and convert it to a float. + + Use this to create a float [0, 1) based on the Blake2 hash function. + """ + return ( + int.from_bytes(blake2b(b, digest_size=8).digest(), "big") / 0xFFFFFFFFFFFFFFFF + ) + + +def rendezvous_rank(buckets, key): + """Rank the buckets for a given key using Rendez-vous hashing + + Each bucket is scored for the key and the best match is assigned the + highest score. The return value is a list of tuples of (score, bucket), + sorted in decreasing order. + """ + ranking = [] + for bucket in buckets: + h = blake2b_as_float(b"%s-%s" % (str(key).encode(), str(bucket).encode())) + score = 1 / -log(h) + ranking.append((score, bucket)) + + return sorted(ranking, reverse=True) + + class ByteSpecification(Integer): """ Allow easily specifying bytes in units of 1024 with suffixes @@ -17,10 +46,10 @@ class ByteSpecification(Integer): """ UNIT_SUFFIXES = { - 'K': 1024, - 'M': 1024 * 1024, - 'G': 1024 * 1024 * 1024, - 'T': 1024 * 1024 * 1024 * 1024, + "K": 1024, + "M": 1024 * 1024, + "G": 1024 * 1024 * 1024, + "T": 1024 * 1024 * 1024 * 1024, } # Default to allowing None as a value @@ -40,16 +69,25 @@ def validate(self, obj, value): try: num = float(value[:-1]) except ValueError: - raise TraitError('{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T'.format(val=value)) + raise TraitError( + "{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T".format( + val=value + ) + ) suffix = value[-1] if suffix not in self.UNIT_SUFFIXES: - raise TraitError('{val} is not a valid memory specification. 
Must be an int or a string with suffix K, M, G, T'.format(val=value)) + raise TraitError( + "{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T".format( + val=value + ) + ) else: return int(float(num) * self.UNIT_SUFFIXES[suffix]) class Cache(OrderedDict): """Basic LRU Cache with get/set""" + def __init__(self, max_size=1024): self.max_size = max_size @@ -83,17 +121,17 @@ def url_path_join(*pieces): Copied from `notebook.utils.url_path_join`. """ - initial = pieces[0].startswith('/') - final = pieces[-1].endswith('/') - stripped = [ s.strip('/') for s in pieces ] - result = '/'.join(s for s in stripped if s) + initial = pieces[0].startswith("/") + final = pieces[-1].endswith("/") + stripped = [s.strip("/") for s in pieces] + result = "/".join(s for s in stripped if s) if initial: - result = '/' + result + result = "/" + result if final: - result = result + '/' - if result == '//': - result = '/' + result = result + "/" + if result == "//": + result = "/" return result From d95d8d4d694b0c2a9f360fb0bb16db6ec4f08429 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 15 Sep 2019 18:49:41 +0200 Subject: [PATCH 02/10] Assign node affinity based on rendez-vous hash --- binderhub/app.py | 13 ++++++ binderhub/build.py | 82 ++++++++++++++++++++++++++--------- binderhub/builder.py | 3 +- binderhub/tests/test_build.py | 40 +++++++++++++++++ 4 files changed, 116 insertions(+), 22 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index 278fe4d1c..c3e1c835a 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -196,6 +196,19 @@ def _valid_badge_base_url(self, proposal): config=True, ) + sticky_builds = Bool( + False, + help=""" + Attempt to assign builds for the same repository to the same node. + + In order to speed up re-builds of a repository all its builds will + be assigned to the same node in the cluster. + + Note: This feature only works if you also enable docker-in-docker support. + """, + config=True, + ) + use_registry = Bool( True, help=""" diff --git a/binderhub/build.py b/binderhub/build.py index ec17bd339..dfafcc5b2 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -12,6 +12,8 @@ from tornado.ioloop import IOLoop from tornado.log import app_log +from .utils import rendezvous_rank + class Build: """Represents a build of a git repository into a docker image. @@ -36,7 +38,7 @@ class Build: """ def __init__(self, q, api, name, namespace, repo_url, ref, git_credentials, build_image, image_name, push_secret, memory_limit, docker_host, node_selector, - appendix='', log_tail_lines=100): + appendix='', log_tail_lines=100, sticky_builds=False): self.q = q self.api = api self.repo_url = repo_url @@ -56,6 +58,10 @@ def __init__(self, q, api, name, namespace, repo_url, ref, git_credentials, buil self.stop_event = threading.Event() self.git_credentials = git_credentials + self.sticky_builds = sticky_builds + + self._component_label = "binderhub-build" + def get_cmd(self): """Get the cmd to run to build the image""" cmd = [ @@ -144,8 +150,59 @@ def progress(self, kind, obj): """Put the current action item into the queue for execution.""" self.main_loop.add_callback(self.q.put, {'kind': kind, 'payload': obj}) + def get_affinity(self): + """Determine the affinity term for the build pod. + + There are a total of two affinity strategies, which one is used depends + on how the BinderHub is configured. + + In the default setup the affinity of each build pod is an "anti-affinity" + which causes pods to prefer to schedule on separate nodes. 
+ + In a setup with docker-in-docker pods for a particular repository + prefer to schedule on the same node in order to reuse the build cache + of previous builds. + """ + dind_pods = self.api.list_namespaced_pod( + self.namespace, + label_selector="component=dind,app=binder", + ) + if self.sticky_builds and dind_pods: + nodes = dict((item.spec.node_name, item.spec) + for item in dind_pods.items) + ranked_nodes = rendezvous_rank(list(nodes.keys), self.repo_url) + best_node_name = ranked_nodes[0][1] + # we reuse the affinity term of the DIND pod instead of + # constructing our own. But we want a soft preference instead of + # a hard preference in case that node is full or has gone away + # in the meantime + affinity = nodes[best_node_name].spec.affinity + affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = affinity.node_affinity.required_during_scheduling_ignored_during_execution + affinity.node_affinity.required_during_scheduling_ignored_during_execution = None + + else: + affinity = client.V1Affinity( + pod_anti_affinity=client.V1PodAntiAffinity( + preferred_during_scheduling_ignored_during_execution=[ + client.V1WeightedPodAffinityTerm( + weight=100, + pod_affinity_term=client.V1PodAffinityTerm( + topology_key="kubernetes.io/hostname", + label_selector=client.V1LabelSelector( + match_labels=dict( + component=self._component_label + ) + ) + ) + ) + ] + ) + ) + + return affinity + def submit(self): - """Submit a image spec to openshift's s2i and wait for completion """ + """Submit a build pod to create the image for the repository.""" volume_mounts = [ client.V1VolumeMount(mount_path="/var/run/docker.sock", name="docker-socket") ] @@ -166,13 +223,12 @@ def submit(self): if self.git_credentials: env.append(client.V1EnvVar(name='GIT_CREDENTIAL_ENV', value=self.git_credentials)) - component_label = "binderhub-build" self.pod = client.V1Pod( metadata=client.V1ObjectMeta( name=self.name, labels={ "name": self.name, - "component": component_label, + "component": self._component_label, }, annotations={ "binder-repo": self.repo_url, @@ -211,23 +267,7 @@ def submit(self): node_selector=self.node_selector, volumes=volumes, restart_policy="Never", - affinity=client.V1Affinity( - pod_anti_affinity=client.V1PodAntiAffinity( - preferred_during_scheduling_ignored_during_execution=[ - client.V1WeightedPodAffinityTerm( - weight=100, - pod_affinity_term=client.V1PodAffinityTerm( - topology_key="kubernetes.io/hostname", - label_selector=client.V1LabelSelector( - match_labels=dict( - component=component_label - ) - ) - ) - ) - ] - ) - ) + affinity=self.get_affinity() ) ) diff --git a/binderhub/builder.py b/binderhub/builder.py index 85d7fd958..463b68bb6 100644 --- a/binderhub/builder.py +++ b/binderhub/builder.py @@ -340,7 +340,8 @@ async def get(self, provider_prefix, _unescaped_spec): node_selector=self.settings['build_node_selector'], appendix=appendix, log_tail_lines=self.settings['log_tail_lines'], - git_credentials=provider.git_credentials + git_credentials=provider.git_credentials, + sticky_builds=self.settings['sticky_builds'], ) with BUILDS_INPROGRESS.track_inprogress(): diff --git a/binderhub/tests/test_build.py b/binderhub/tests/test_build.py index cca364e52..5b2d13a71 100644 --- a/binderhub/tests/test_build.py +++ b/binderhub/tests/test_build.py @@ -8,6 +8,8 @@ import pytest from tornado.httputil import url_concat +from kubernetes import client + from binderhub.build import Build from .utils import async_requests @@ -51,6 +53,44 @@ async def test_build(app, needs_build, 
needs_launch, always_build, slug, pytestc assert r.url.startswith(final['url']) +def test_default_affinity(): + # check that the default affinity is a pod anti-affinity + build = Build( + mock.MagicMock(), api=mock.MagicMock(), name='test_build', + namespace='build_namespace', repo_url=mock.MagicMock(), + ref=mock.MagicMock(), build_image=mock.MagicMock(), + image_name=mock.MagicMock(), push_secret=mock.MagicMock(), + memory_limit=mock.MagicMock(), git_credentials=None, + docker_host='http://mydockerregistry.local', + node_selector=mock.MagicMock()) + + affinity = build.get_affinity() + + assert isinstance(affinity, client.V1Affinity) + assert affinity.node_affinity is None + assert affinity.pod_affinity is None + assert affinity.pod_anti_affinity is not None + + +def test_sticky_builds_affinity(): + build = Build( + mock.MagicMock(), api=mock.MagicMock(), name='test_build', + namespace='build_namespace', repo_url=mock.MagicMock(), + ref=mock.MagicMock(), build_image=mock.MagicMock(), + image_name=mock.MagicMock(), push_secret=mock.MagicMock(), + memory_limit=mock.MagicMock(), git_credentials=None, + docker_host='http://mydockerregistry.local', + node_selector=mock.MagicMock(), + sticky_builds=True) + + affinity = build.get_affinity() + + assert isinstance(affinity, client.V1Affinity) + assert affinity.node_affinity is not None + assert affinity.pod_affinity is None + assert affinity.pod_anti_affinity is None + + def test_git_credentials_passed_to_podspec_upon_submit(): git_credentials = { 'client_id': 'my_username', From 33be8557725e6e1b3d4109a330642d949fc831b9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 22 Sep 2019 14:52:42 +0200 Subject: [PATCH 03/10] Explicitly create affinity spec --- binderhub/app.py | 1 + binderhub/build.py | 45 ++++++++++++++++++++++------------- binderhub/tests/test_utils.py | 8 +++---- binderhub/utils.py | 21 +++++++--------- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index c3e1c835a..cb3864c25 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -568,6 +568,7 @@ def initialize(self, *args, **kwargs): "build_image": self.build_image, 'build_node_selector': self.build_node_selector, 'build_pool': self.build_pool, + "sticky_builds": self.sticky_builds, 'log_tail_lines': self.log_tail_lines, 'pod_quota': self.pod_quota, 'per_repo_quota': self.per_repo_quota, diff --git a/binderhub/build.py b/binderhub/build.py index dfafcc5b2..ad4abbb10 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -153,32 +153,43 @@ def progress(self, kind, obj): def get_affinity(self): """Determine the affinity term for the build pod. - There are a total of two affinity strategies, which one is used depends - on how the BinderHub is configured. + There are a two affinity strategies, which one is used depends on how + the BinderHub is configured. In the default setup the affinity of each build pod is an "anti-affinity" - which causes pods to prefer to schedule on separate nodes. + which causes the pods to prefer to schedule on separate nodes. - In a setup with docker-in-docker pods for a particular repository - prefer to schedule on the same node in order to reuse the build cache - of previous builds. + In a setup with docker-in-docker enabled pods for a particular + repository prefer to schedule on the same node in order to reuse the + docker layer cache of previous builds. 
""" dind_pods = self.api.list_namespaced_pod( self.namespace, label_selector="component=dind,app=binder", ) if self.sticky_builds and dind_pods: - nodes = dict((item.spec.node_name, item.spec) - for item in dind_pods.items) - ranked_nodes = rendezvous_rank(list(nodes.keys), self.repo_url) - best_node_name = ranked_nodes[0][1] - # we reuse the affinity term of the DIND pod instead of - # constructing our own. But we want a soft preference instead of - # a hard preference in case that node is full or has gone away - # in the meantime - affinity = nodes[best_node_name].spec.affinity - affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = affinity.node_affinity.required_during_scheduling_ignored_during_execution - affinity.node_affinity.required_during_scheduling_ignored_during_execution = None + node_names = [pod.spec.node_name for pod in dind_pods.items] + ranked_nodes = rendezvous_rank(node_names, self.repo_url) + best_node_name = ranked_nodes[0] + + affinity = client.V1Affinity( + node_affinity=client.V1NodeAffinity( + preferred_during_scheduling_ignored_during_execution=[ + client.V1PreferredSchedulingTerm( + weight=100, + preference=client.V1NodeSelectorTerm( + match_fields=[ + client.V1NodeSelectorRequirement( + key="metadata.name", + operator="In", + values=[best_node_name], + ) + ] + ), + ) + ] + ) + ) else: affinity = client.V1Affinity( diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index 3370b71b4..e73c8120e 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -6,20 +6,18 @@ def test_rendezvous_rank(): # other buckets are removed key = "k1" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) - - print(first_round) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key key = "sdsdggdddddd" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key key = "crazy frog is a crazy key" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key diff --git a/binderhub/utils.py b/binderhub/utils.py index 38fd8380b..94d0cebf4 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -1,35 +1,30 @@ """Miscellaneous utilities""" from collections import OrderedDict from hashlib import blake2b -from math import log from traitlets import Integer, TraitError -def blake2b_as_float(b): - """Compute Blake2 digest and convert it to a float. +def blake2b_as_int(b): + """Compute digest of the bytes `b` using the Blake2 hash function. - Use this to create a float [0, 1) based on the Blake2 hash function. + Returns a unsigned 64bit integer. """ - return ( - int.from_bytes(blake2b(b, digest_size=8).digest(), "big") / 0xFFFFFFFFFFFFFFFF - ) + return int.from_bytes(blake2b(b, digest_size=8).digest(), "big") def rendezvous_rank(buckets, key): """Rank the buckets for a given key using Rendez-vous hashing - Each bucket is scored for the key and the best match is assigned the - highest score. 
The return value is a list of tuples of (score, bucket), - sorted in decreasing order. + Each bucket is scored for the specified key. The return value is a list of + all buckets, sorted in decreasing order (highest ranked first). """ ranking = [] for bucket in buckets: - h = blake2b_as_float(b"%s-%s" % (str(key).encode(), str(bucket).encode())) - score = 1 / -log(h) + score = blake2b_as_int(b"%s-%s" % (str(key).encode(), str(bucket).encode())) ranking.append((score, bucket)) - return sorted(ranking, reverse=True) + return [b for (s, b) in sorted(ranking, reverse=True)] class ByteSpecification(Integer): From b58a84dffa3160621aa5a869124d19662fc576b9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 22 Sep 2019 15:36:30 +0200 Subject: [PATCH 04/10] Finish sticky-build test --- binderhub/build.py | 1 + binderhub/tests/test_build.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/binderhub/build.py b/binderhub/build.py index ad4abbb10..bdd81e6dc 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -167,6 +167,7 @@ def get_affinity(self): self.namespace, label_selector="component=dind,app=binder", ) + if self.sticky_builds and dind_pods: node_names = [pod.spec.node_name for pod in dind_pods.items] ranked_nodes = rendezvous_rank(node_names, self.repo_url) diff --git a/binderhub/tests/test_build.py b/binderhub/tests/test_build.py index 5b2d13a71..60798a199 100644 --- a/binderhub/tests/test_build.py +++ b/binderhub/tests/test_build.py @@ -2,6 +2,7 @@ import json import sys +from collections import namedtuple from unittest import mock from urllib.parse import quote @@ -73,8 +74,18 @@ def test_default_affinity(): def test_sticky_builds_affinity(): + # Setup some mock objects for the response from the k8s API + Pod = namedtuple("Pod", "spec") + PodSpec = namedtuple("PodSpec", "node_name") + PodList = namedtuple("PodList", "items") + + mock_k8s_api = mock.MagicMock() + mock_k8s_api.list_namespaced_pod.return_value = PodList( + [Pod(PodSpec("node-a")), Pod(PodSpec("node-b"))], + ) + build = Build( - mock.MagicMock(), api=mock.MagicMock(), name='test_build', + mock.MagicMock(), api=mock_k8s_api, name='test_build', namespace='build_namespace', repo_url=mock.MagicMock(), ref=mock.MagicMock(), build_image=mock.MagicMock(), image_name=mock.MagicMock(), push_secret=mock.MagicMock(), @@ -90,6 +101,9 @@ def test_sticky_builds_affinity(): assert affinity.pod_affinity is None assert affinity.pod_anti_affinity is None + # One of the two nodes we have in our mock should be the preferred node + assert affinity.node_affinity.preferred_during_scheduling_ignored_during_execution[0].preference.match_fields[0].values[0] in ("node-a", "node-b") + def test_git_credentials_passed_to_podspec_upon_submit(): git_credentials = { From 7a2762804babd97f83a563750fbd0b49c02952ac Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 23 Sep 2019 08:13:35 +0200 Subject: [PATCH 05/10] Rename hash function and add comment about its use --- binderhub/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/binderhub/utils.py b/binderhub/utils.py index 94d0cebf4..9b3f7a7fa 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -5,7 +5,7 @@ from traitlets import Integer, TraitError -def blake2b_as_int(b): +def blake2b_hash_as_int(b): """Compute digest of the bytes `b` using the Blake2 hash function. Returns a unsigned 64bit integer. 
@@ -21,7 +21,12 @@ def rendezvous_rank(buckets, key): """ ranking = [] for bucket in buckets: - score = blake2b_as_int(b"%s-%s" % (str(key).encode(), str(bucket).encode())) + # The particular hash function doesn't matter a lot, as long as it is + # one that maps the key to a fixed sized value and distributes the keys + # uniformly across the output space + score = blake2b_hash_as_int( + b"%s-%s" % (str(key).encode(), str(bucket).encode()) + ) ranking.append((score, bucket)) return [b for (s, b) in sorted(ranking, reverse=True)] From 85ee797d3b773ae26ca642c040e61cb23a4fde29 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 23 Sep 2019 08:21:17 +0200 Subject: [PATCH 06/10] Add test to check relative ranking --- binderhub/tests/test_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index e73c8120e..9c1c7ff7d 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -21,3 +21,15 @@ def test_rendezvous_rank(): second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key + + +def test_rendezvous_independence(): + # check that the relative ranking of two buckets doesn't depend on the + # presence of a third bucket + key = "k1" + two_buckets = utils.rendezvous_rank(["b1", "b3"], key) + three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) + # remove "b2" and check ranking + three_buckets.remove("b2") + + assert two_buckets == three_buckets From b28f5e3f321674a958ead4ad958abbfff2107dff Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 24 Sep 2019 15:17:13 +0200 Subject: [PATCH 07/10] Remove repeated tests --- binderhub/tests/test_utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index 9c1c7ff7d..b0ef9b998 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -4,18 +4,6 @@ def test_rendezvous_rank(): # check that a key doesn't move if its assigned bucket remains but the # other buckets are removed - key = "k1" - first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) - - assert first_round[0] == second_round[0], key - - key = "sdsdggdddddd" - first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) - - assert first_round[0] == second_round[0], key - key = "crazy frog is a crazy key" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) From 83310fc5794a2690d070a73972973c4065c35b3b Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 24 Sep 2019 22:18:16 +0200 Subject: [PATCH 08/10] Add stability and redistribution tests --- binderhub/tests/test_utils.py | 47 +++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index b0ef9b998..e45659823 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -12,12 +12,45 @@ def test_rendezvous_rank(): def test_rendezvous_independence(): - # check that the relative ranking of two buckets doesn't depend on the - # presence of a third bucket + # check that the relative ranking of 80 buckets doesn't depend on the + # presence of 20 extra buckets key = "k1" - two_buckets = utils.rendezvous_rank(["b1", "b3"], 
key) - three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) - # remove "b2" and check ranking - three_buckets.remove("b2") + eighty_buckets = utils.rendezvous_rank(["b%i" % i for i in range(80)], key) + hundred_buckets = utils.rendezvous_rank(["b%i" % i for i in range(100)], key) - assert two_buckets == three_buckets + for i in range(80, 100): + hundred_buckets.remove("b%i" % i) + + assert eighty_buckets == hundred_buckets + + +def test_rendezvous_redistribution(): + # check that approximately a third of keys move to the new bucket + # when one is added + n_keys = 3000 + + # counnt how many keys were moved and which bucket were they moved from + n_moved = 0 + from_b1 = 0 + from_b2 = 0 + + for i in range(n_keys): + key = f"key-{i}" + two_buckets = utils.rendezvous_rank(["b1", "b2"], key) + three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) + + if two_buckets[0] != three_buckets[0]: + n_moved += 1 + if two_buckets[0] == "b1": + from_b1 += 1 + if two_buckets[0] == "b2": + from_b2 += 1 + # should always move to the newly added bucket + assert three_buckets[0] == "b3" + + # because of statistical fluctuations we have to leave some room when + # making this comparison + assert 0.31 < n_moved / n_keys < 0.35 + # keys should move from the two original buckets with approximately + # equal probability + assert abs(from_b1 - from_b2) < 10 From e1567d1701aa7aadd95d0495ba47e9f38dad3ad5 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Wed, 25 Sep 2019 08:22:31 +0200 Subject: [PATCH 09/10] Check that we start with roughly equal bucket assignments --- binderhub/tests/test_utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index e45659823..108397e9b 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -29,22 +29,22 @@ def test_rendezvous_redistribution(): # when one is added n_keys = 3000 - # counnt how many keys were moved and which bucket were they moved from + # count how many keys were moved, which bucket a key started from and + # which bucket a key was moved from (to the new bucket) n_moved = 0 - from_b1 = 0 - from_b2 = 0 + from_bucket = {"b1": 0, "b2": 0} + start_in = {"b1": 0, "b2": 0} for i in range(n_keys): key = f"key-{i}" two_buckets = utils.rendezvous_rank(["b1", "b2"], key) + start_in[two_buckets[0]] += 1 three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) if two_buckets[0] != three_buckets[0]: n_moved += 1 - if two_buckets[0] == "b1": - from_b1 += 1 - if two_buckets[0] == "b2": - from_b2 += 1 + from_bucket[two_buckets[0]] += 1 + # should always move to the newly added bucket assert three_buckets[0] == "b3" @@ -53,4 +53,8 @@ def test_rendezvous_redistribution(): assert 0.31 < n_moved / n_keys < 0.35 # keys should move from the two original buckets with approximately # equal probability - assert abs(from_b1 - from_b2) < 10 + assert abs(from_bucket["b1"] - from_bucket["b2"]) < 10 + + # use the standard deviation of the expected number of entries in each + # bucket to get an idea for a reasonable (order of magnitude) difference + assert abs(start_in["b1"] - start_in["b2"]) < (n_keys/2)**0.5 From 34c9eeeb2113b13ba9905faf96e2e20805e56d55 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Wed, 25 Sep 2019 09:51:28 +0200 Subject: [PATCH 10/10] Extend rendezvous hash testing --- binderhub/tests/test_utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/binderhub/tests/test_utils.py 
b/binderhub/tests/test_utils.py index 108397e9b..fc681fba2 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -52,9 +52,8 @@ def test_rendezvous_redistribution(): # making this comparison assert 0.31 < n_moved / n_keys < 0.35 # keys should move from the two original buckets with approximately - # equal probability - assert abs(from_bucket["b1"] - from_bucket["b2"]) < 10 - - # use the standard deviation of the expected number of entries in each - # bucket to get an idea for a reasonable (order of magnitude) difference - assert abs(start_in["b1"] - start_in["b2"]) < (n_keys/2)**0.5 + # equal probability. We pick 30 because it is "about right" + assert abs(from_bucket["b1"] - from_bucket["b2"]) < 30 + # the initial distribution of keys should be roughly the same + # We pick 30 because it is "about right" + assert abs(start_in["b1"] - start_in["b2"]) < 30
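
For reference, here is a minimal, self-contained sketch of the rendezvous ("highest random weight") ranking that this series introduces. It mirrors the final shape of binderhub/utils.rendezvous_rank after patch 05, but it is an illustration rather than part of the PR; the node names and repository URL in the example run are made up.

# Standalone sketch of the rendezvous ranking used by the sticky-builds
# feature. Node names and the repository URL below are illustrative only.
from hashlib import blake2b


def blake2b_hash_as_int(b):
    """Hash the bytes `b` with Blake2 and return an unsigned 64-bit integer."""
    return int.from_bytes(blake2b(b, digest_size=8).digest(), "big")


def rendezvous_rank(buckets, key):
    """Return `buckets` sorted so the best match for `key` comes first.

    Every (key, bucket) pair gets an independent pseudo-random score; the
    bucket with the highest score wins. Removing a bucket only reassigns
    the keys that were mapped to it, which is what keeps builds sticky.
    """
    ranking = []
    for bucket in buckets:
        score = blake2b_hash_as_int(
            b"%s-%s" % (str(key).encode(), str(bucket).encode())
        )
        ranking.append((score, bucket))
    return [bucket for (score, bucket) in sorted(ranking, reverse=True)]


if __name__ == "__main__":
    nodes = ["node-a", "node-b", "node-c"]
    repo = "https://github.com/binder-examples/requirements"  # hypothetical key

    # The first entry is the preferred node for this repository's build pods.
    print(rendezvous_rank(nodes, repo))

    # If the preferred node disappears, only this repository's builds move;
    # keys that preferred one of the surviving nodes keep their assignment.
    survivors = [n for n in nodes if n != rendezvous_rank(nodes, repo)[0]]
    print(rendezvous_rank(survivors, repo))

This independence between buckets is what the tests in patches 08-10 exercise: adding a third node should move roughly a third of the keys, all of them onto the new node. Build.get_affinity then expresses the winning node only as a weighted preferredDuringSchedulingIgnoredDuringExecution node-affinity term, so a build still schedules elsewhere if that node is full or has gone away.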