From 4e2ac0478f90ae53ee97f921eef14c524eb22eed Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 12 Sep 2019 07:43:47 +0200 Subject: [PATCH 01/10] Add function to rank buckets for a key The ranking is computed using the Rendez-vous hashing algorithm. --- binderhub/tests/test_utils.py | 25 +++++++++++++ binderhub/utils.py | 66 +++++++++++++++++++++++++++-------- 2 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 binderhub/tests/test_utils.py diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py new file mode 100644 index 000000000..3370b71b4 --- /dev/null +++ b/binderhub/tests/test_utils.py @@ -0,0 +1,25 @@ +from binderhub import utils + + +def test_rendezvous_rank(): + # check that a key doesn't move if its assigned bucket remains but the + # other buckets are removed + key = "k1" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + print(first_round) + + assert first_round[0] == second_round[0], key + + key = "sdsdggdddddd" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + assert first_round[0] == second_round[0], key + + key = "crazy frog is a crazy key" + first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) + second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + + assert first_round[0] == second_round[0], key diff --git a/binderhub/utils.py b/binderhub/utils.py index 614fcaafc..38fd8380b 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -1,8 +1,37 @@ """Miscellaneous utilities""" from collections import OrderedDict +from hashlib import blake2b +from math import log + from traitlets import Integer, TraitError +def blake2b_as_float(b): + """Compute Blake2 digest and convert it to a float. + + Use this to create a float [0, 1) based on the Blake2 hash function. + """ + return ( + int.from_bytes(blake2b(b, digest_size=8).digest(), "big") / 0xFFFFFFFFFFFFFFFF + ) + + +def rendezvous_rank(buckets, key): + """Rank the buckets for a given key using Rendez-vous hashing + + Each bucket is scored for the key and the best match is assigned the + highest score. The return value is a list of tuples of (score, bucket), + sorted in decreasing order. + """ + ranking = [] + for bucket in buckets: + h = blake2b_as_float(b"%s-%s" % (str(key).encode(), str(bucket).encode())) + score = 1 / -log(h) + ranking.append((score, bucket)) + + return sorted(ranking, reverse=True) + + class ByteSpecification(Integer): """ Allow easily specifying bytes in units of 1024 with suffixes @@ -17,10 +46,10 @@ class ByteSpecification(Integer): """ UNIT_SUFFIXES = { - 'K': 1024, - 'M': 1024 * 1024, - 'G': 1024 * 1024 * 1024, - 'T': 1024 * 1024 * 1024 * 1024, + "K": 1024, + "M": 1024 * 1024, + "G": 1024 * 1024 * 1024, + "T": 1024 * 1024 * 1024 * 1024, } # Default to allowing None as a value @@ -40,16 +69,25 @@ def validate(self, obj, value): try: num = float(value[:-1]) except ValueError: - raise TraitError('{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T'.format(val=value)) + raise TraitError( + "{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T".format( + val=value + ) + ) suffix = value[-1] if suffix not in self.UNIT_SUFFIXES: - raise TraitError('{val} is not a valid memory specification. 
Must be an int or a string with suffix K, M, G, T'.format(val=value)) + raise TraitError( + "{val} is not a valid memory specification. Must be an int or a string with suffix K, M, G, T".format( + val=value + ) + ) else: return int(float(num) * self.UNIT_SUFFIXES[suffix]) class Cache(OrderedDict): """Basic LRU Cache with get/set""" + def __init__(self, max_size=1024): self.max_size = max_size @@ -83,17 +121,17 @@ def url_path_join(*pieces): Copied from `notebook.utils.url_path_join`. """ - initial = pieces[0].startswith('/') - final = pieces[-1].endswith('/') - stripped = [ s.strip('/') for s in pieces ] - result = '/'.join(s for s in stripped if s) + initial = pieces[0].startswith("/") + final = pieces[-1].endswith("/") + stripped = [s.strip("/") for s in pieces] + result = "/".join(s for s in stripped if s) if initial: - result = '/' + result + result = "/" + result if final: - result = result + '/' - if result == '//': - result = '/' + result = result + "/" + if result == "//": + result = "/" return result From d95d8d4d694b0c2a9f360fb0bb16db6ec4f08429 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 15 Sep 2019 18:49:41 +0200 Subject: [PATCH 02/10] Assign node affinity based on rendez-vous hash --- binderhub/app.py | 13 ++++++ binderhub/build.py | 82 ++++++++++++++++++++++++++--------- binderhub/builder.py | 3 +- binderhub/tests/test_build.py | 40 +++++++++++++++++ 4 files changed, 116 insertions(+), 22 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index 278fe4d1c..c3e1c835a 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -196,6 +196,19 @@ def _valid_badge_base_url(self, proposal): config=True, ) + sticky_builds = Bool( + False, + help=""" + Attempt to assign builds for the same repository to the same node. + + In order to speed up re-builds of a repository all its builds will + be assigned to the same node in the cluster. + + Note: This feature only works if you also enable docker-in-docker support. + """, + config=True, + ) + use_registry = Bool( True, help=""" diff --git a/binderhub/build.py b/binderhub/build.py index ec17bd339..dfafcc5b2 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -12,6 +12,8 @@ from tornado.ioloop import IOLoop from tornado.log import app_log +from .utils import rendezvous_rank + class Build: """Represents a build of a git repository into a docker image. @@ -36,7 +38,7 @@ class Build: """ def __init__(self, q, api, name, namespace, repo_url, ref, git_credentials, build_image, image_name, push_secret, memory_limit, docker_host, node_selector, - appendix='', log_tail_lines=100): + appendix='', log_tail_lines=100, sticky_builds=False): self.q = q self.api = api self.repo_url = repo_url @@ -56,6 +58,10 @@ def __init__(self, q, api, name, namespace, repo_url, ref, git_credentials, buil self.stop_event = threading.Event() self.git_credentials = git_credentials + self.sticky_builds = sticky_builds + + self._component_label = "binderhub-build" + def get_cmd(self): """Get the cmd to run to build the image""" cmd = [ @@ -144,8 +150,59 @@ def progress(self, kind, obj): """Put the current action item into the queue for execution.""" self.main_loop.add_callback(self.q.put, {'kind': kind, 'payload': obj}) + def get_affinity(self): + """Determine the affinity term for the build pod. + + There are a total of two affinity strategies, which one is used depends + on how the BinderHub is configured. + + In the default setup the affinity of each build pod is an "anti-affinity" + which causes pods to prefer to schedule on separate nodes. 
+ + In a setup with docker-in-docker pods for a particular repository + prefer to schedule on the same node in order to reuse the build cache + of previous builds. + """ + dind_pods = self.api.list_namespaced_pod( + self.namespace, + label_selector="component=dind,app=binder", + ) + if self.sticky_builds and dind_pods: + nodes = dict((item.spec.node_name, item.spec) + for item in dind_pods.items) + ranked_nodes = rendezvous_rank(list(nodes.keys), self.repo_url) + best_node_name = ranked_nodes[0][1] + # we reuse the affinity term of the DIND pod instead of + # constructing our own. But we want a soft preference instead of + # a hard preference in case that node is full or has gone away + # in the meantime + affinity = nodes[best_node_name].spec.affinity + affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = affinity.node_affinity.required_during_scheduling_ignored_during_execution + affinity.node_affinity.required_during_scheduling_ignored_during_execution = None + + else: + affinity = client.V1Affinity( + pod_anti_affinity=client.V1PodAntiAffinity( + preferred_during_scheduling_ignored_during_execution=[ + client.V1WeightedPodAffinityTerm( + weight=100, + pod_affinity_term=client.V1PodAffinityTerm( + topology_key="kubernetes.io/hostname", + label_selector=client.V1LabelSelector( + match_labels=dict( + component=self._component_label + ) + ) + ) + ) + ] + ) + ) + + return affinity + def submit(self): - """Submit a image spec to openshift's s2i and wait for completion """ + """Submit a build pod to create the image for the repository.""" volume_mounts = [ client.V1VolumeMount(mount_path="/var/run/docker.sock", name="docker-socket") ] @@ -166,13 +223,12 @@ def submit(self): if self.git_credentials: env.append(client.V1EnvVar(name='GIT_CREDENTIAL_ENV', value=self.git_credentials)) - component_label = "binderhub-build" self.pod = client.V1Pod( metadata=client.V1ObjectMeta( name=self.name, labels={ "name": self.name, - "component": component_label, + "component": self._component_label, }, annotations={ "binder-repo": self.repo_url, @@ -211,23 +267,7 @@ def submit(self): node_selector=self.node_selector, volumes=volumes, restart_policy="Never", - affinity=client.V1Affinity( - pod_anti_affinity=client.V1PodAntiAffinity( - preferred_during_scheduling_ignored_during_execution=[ - client.V1WeightedPodAffinityTerm( - weight=100, - pod_affinity_term=client.V1PodAffinityTerm( - topology_key="kubernetes.io/hostname", - label_selector=client.V1LabelSelector( - match_labels=dict( - component=component_label - ) - ) - ) - ) - ] - ) - ) + affinity=self.get_affinity() ) ) diff --git a/binderhub/builder.py b/binderhub/builder.py index 85d7fd958..463b68bb6 100644 --- a/binderhub/builder.py +++ b/binderhub/builder.py @@ -340,7 +340,8 @@ async def get(self, provider_prefix, _unescaped_spec): node_selector=self.settings['build_node_selector'], appendix=appendix, log_tail_lines=self.settings['log_tail_lines'], - git_credentials=provider.git_credentials + git_credentials=provider.git_credentials, + sticky_builds=self.settings['sticky_builds'], ) with BUILDS_INPROGRESS.track_inprogress(): diff --git a/binderhub/tests/test_build.py b/binderhub/tests/test_build.py index cca364e52..5b2d13a71 100644 --- a/binderhub/tests/test_build.py +++ b/binderhub/tests/test_build.py @@ -8,6 +8,8 @@ import pytest from tornado.httputil import url_concat +from kubernetes import client + from binderhub.build import Build from .utils import async_requests @@ -51,6 +53,44 @@ async def test_build(app, needs_build, 
needs_launch, always_build, slug, pytestc assert r.url.startswith(final['url']) +def test_default_affinity(): + # check that the default affinity is a pod anti-affinity + build = Build( + mock.MagicMock(), api=mock.MagicMock(), name='test_build', + namespace='build_namespace', repo_url=mock.MagicMock(), + ref=mock.MagicMock(), build_image=mock.MagicMock(), + image_name=mock.MagicMock(), push_secret=mock.MagicMock(), + memory_limit=mock.MagicMock(), git_credentials=None, + docker_host='http://mydockerregistry.local', + node_selector=mock.MagicMock()) + + affinity = build.get_affinity() + + assert isinstance(affinity, client.V1Affinity) + assert affinity.node_affinity is None + assert affinity.pod_affinity is None + assert affinity.pod_anti_affinity is not None + + +def test_sticky_builds_affinity(): + build = Build( + mock.MagicMock(), api=mock.MagicMock(), name='test_build', + namespace='build_namespace', repo_url=mock.MagicMock(), + ref=mock.MagicMock(), build_image=mock.MagicMock(), + image_name=mock.MagicMock(), push_secret=mock.MagicMock(), + memory_limit=mock.MagicMock(), git_credentials=None, + docker_host='http://mydockerregistry.local', + node_selector=mock.MagicMock(), + sticky_builds=True) + + affinity = build.get_affinity() + + assert isinstance(affinity, client.V1Affinity) + assert affinity.node_affinity is not None + assert affinity.pod_affinity is None + assert affinity.pod_anti_affinity is None + + def test_git_credentials_passed_to_podspec_upon_submit(): git_credentials = { 'client_id': 'my_username', From 33be8557725e6e1b3d4109a330642d949fc831b9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 22 Sep 2019 14:52:42 +0200 Subject: [PATCH 03/10] Explicitly create affinity spec --- binderhub/app.py | 1 + binderhub/build.py | 45 ++++++++++++++++++++++------------- binderhub/tests/test_utils.py | 8 +++---- binderhub/utils.py | 21 +++++++--------- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index c3e1c835a..cb3864c25 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -568,6 +568,7 @@ def initialize(self, *args, **kwargs): "build_image": self.build_image, 'build_node_selector': self.build_node_selector, 'build_pool': self.build_pool, + "sticky_builds": self.sticky_builds, 'log_tail_lines': self.log_tail_lines, 'pod_quota': self.pod_quota, 'per_repo_quota': self.per_repo_quota, diff --git a/binderhub/build.py b/binderhub/build.py index dfafcc5b2..ad4abbb10 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -153,32 +153,43 @@ def progress(self, kind, obj): def get_affinity(self): """Determine the affinity term for the build pod. - There are a total of two affinity strategies, which one is used depends - on how the BinderHub is configured. + There are a two affinity strategies, which one is used depends on how + the BinderHub is configured. In the default setup the affinity of each build pod is an "anti-affinity" - which causes pods to prefer to schedule on separate nodes. + which causes the pods to prefer to schedule on separate nodes. - In a setup with docker-in-docker pods for a particular repository - prefer to schedule on the same node in order to reuse the build cache - of previous builds. + In a setup with docker-in-docker enabled pods for a particular + repository prefer to schedule on the same node in order to reuse the + docker layer cache of previous builds. 
""" dind_pods = self.api.list_namespaced_pod( self.namespace, label_selector="component=dind,app=binder", ) if self.sticky_builds and dind_pods: - nodes = dict((item.spec.node_name, item.spec) - for item in dind_pods.items) - ranked_nodes = rendezvous_rank(list(nodes.keys), self.repo_url) - best_node_name = ranked_nodes[0][1] - # we reuse the affinity term of the DIND pod instead of - # constructing our own. But we want a soft preference instead of - # a hard preference in case that node is full or has gone away - # in the meantime - affinity = nodes[best_node_name].spec.affinity - affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = affinity.node_affinity.required_during_scheduling_ignored_during_execution - affinity.node_affinity.required_during_scheduling_ignored_during_execution = None + node_names = [pod.spec.node_name for pod in dind_pods.items] + ranked_nodes = rendezvous_rank(node_names, self.repo_url) + best_node_name = ranked_nodes[0] + + affinity = client.V1Affinity( + node_affinity=client.V1NodeAffinity( + preferred_during_scheduling_ignored_during_execution=[ + client.V1PreferredSchedulingTerm( + weight=100, + preference=client.V1NodeSelectorTerm( + match_fields=[ + client.V1NodeSelectorRequirement( + key="metadata.name", + operator="In", + values=[best_node_name], + ) + ] + ), + ) + ] + ) + ) else: affinity = client.V1Affinity( diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index 3370b71b4..e73c8120e 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -6,20 +6,18 @@ def test_rendezvous_rank(): # other buckets are removed key = "k1" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) - - print(first_round) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key key = "sdsdggdddddd" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key key = "crazy frog is a crazy key" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0][1], first_round[1][1]], key) + second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key diff --git a/binderhub/utils.py b/binderhub/utils.py index 38fd8380b..94d0cebf4 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -1,35 +1,30 @@ """Miscellaneous utilities""" from collections import OrderedDict from hashlib import blake2b -from math import log from traitlets import Integer, TraitError -def blake2b_as_float(b): - """Compute Blake2 digest and convert it to a float. +def blake2b_as_int(b): + """Compute digest of the bytes `b` using the Blake2 hash function. - Use this to create a float [0, 1) based on the Blake2 hash function. + Returns a unsigned 64bit integer. """ - return ( - int.from_bytes(blake2b(b, digest_size=8).digest(), "big") / 0xFFFFFFFFFFFFFFFF - ) + return int.from_bytes(blake2b(b, digest_size=8).digest(), "big") def rendezvous_rank(buckets, key): """Rank the buckets for a given key using Rendez-vous hashing - Each bucket is scored for the key and the best match is assigned the - highest score. 
The return value is a list of tuples of (score, bucket), - sorted in decreasing order. + Each bucket is scored for the specified key. The return value is a list of + all buckets, sorted in decreasing order (highest ranked first). """ ranking = [] for bucket in buckets: - h = blake2b_as_float(b"%s-%s" % (str(key).encode(), str(bucket).encode())) - score = 1 / -log(h) + score = blake2b_as_int(b"%s-%s" % (str(key).encode(), str(bucket).encode())) ranking.append((score, bucket)) - return sorted(ranking, reverse=True) + return [b for (s, b) in sorted(ranking, reverse=True)] class ByteSpecification(Integer): From b58a84dffa3160621aa5a869124d19662fc576b9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 22 Sep 2019 15:36:30 +0200 Subject: [PATCH 04/10] Finish sticky-build test --- binderhub/build.py | 1 + binderhub/tests/test_build.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/binderhub/build.py b/binderhub/build.py index ad4abbb10..bdd81e6dc 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -167,6 +167,7 @@ def get_affinity(self): self.namespace, label_selector="component=dind,app=binder", ) + if self.sticky_builds and dind_pods: node_names = [pod.spec.node_name for pod in dind_pods.items] ranked_nodes = rendezvous_rank(node_names, self.repo_url) diff --git a/binderhub/tests/test_build.py b/binderhub/tests/test_build.py index 5b2d13a71..60798a199 100644 --- a/binderhub/tests/test_build.py +++ b/binderhub/tests/test_build.py @@ -2,6 +2,7 @@ import json import sys +from collections import namedtuple from unittest import mock from urllib.parse import quote @@ -73,8 +74,18 @@ def test_default_affinity(): def test_sticky_builds_affinity(): + # Setup some mock objects for the response from the k8s API + Pod = namedtuple("Pod", "spec") + PodSpec = namedtuple("PodSpec", "node_name") + PodList = namedtuple("PodList", "items") + + mock_k8s_api = mock.MagicMock() + mock_k8s_api.list_namespaced_pod.return_value = PodList( + [Pod(PodSpec("node-a")), Pod(PodSpec("node-b"))], + ) + build = Build( - mock.MagicMock(), api=mock.MagicMock(), name='test_build', + mock.MagicMock(), api=mock_k8s_api, name='test_build', namespace='build_namespace', repo_url=mock.MagicMock(), ref=mock.MagicMock(), build_image=mock.MagicMock(), image_name=mock.MagicMock(), push_secret=mock.MagicMock(), @@ -90,6 +101,9 @@ def test_sticky_builds_affinity(): assert affinity.pod_affinity is None assert affinity.pod_anti_affinity is None + # One of the two nodes we have in our mock should be the preferred node + assert affinity.node_affinity.preferred_during_scheduling_ignored_during_execution[0].preference.match_fields[0].values[0] in ("node-a", "node-b") + def test_git_credentials_passed_to_podspec_upon_submit(): git_credentials = { From 7a2762804babd97f83a563750fbd0b49c02952ac Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 23 Sep 2019 08:13:35 +0200 Subject: [PATCH 05/10] Rename hash function and add comment about its use --- binderhub/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/binderhub/utils.py b/binderhub/utils.py index 94d0cebf4..9b3f7a7fa 100644 --- a/binderhub/utils.py +++ b/binderhub/utils.py @@ -5,7 +5,7 @@ from traitlets import Integer, TraitError -def blake2b_as_int(b): +def blake2b_hash_as_int(b): """Compute digest of the bytes `b` using the Blake2 hash function. Returns a unsigned 64bit integer. 
@@ -21,7 +21,12 @@ def rendezvous_rank(buckets, key): """ ranking = [] for bucket in buckets: - score = blake2b_as_int(b"%s-%s" % (str(key).encode(), str(bucket).encode())) + # The particular hash function doesn't matter a lot, as long as it is + # one that maps the key to a fixed sized value and distributes the keys + # uniformly across the output space + score = blake2b_hash_as_int( + b"%s-%s" % (str(key).encode(), str(bucket).encode()) + ) ranking.append((score, bucket)) return [b for (s, b) in sorted(ranking, reverse=True)] From 85ee797d3b773ae26ca642c040e61cb23a4fde29 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 23 Sep 2019 08:21:17 +0200 Subject: [PATCH 06/10] Add test to check relative ranking --- binderhub/tests/test_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index e73c8120e..9c1c7ff7d 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -21,3 +21,15 @@ def test_rendezvous_rank(): second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) assert first_round[0] == second_round[0], key + + +def test_rendezvous_independence(): + # check that the relative ranking of two buckets doesn't depend on the + # presence of a third bucket + key = "k1" + two_buckets = utils.rendezvous_rank(["b1", "b3"], key) + three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) + # remove "b2" and check ranking + three_buckets.remove("b2") + + assert two_buckets == three_buckets From b28f5e3f321674a958ead4ad958abbfff2107dff Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 24 Sep 2019 15:17:13 +0200 Subject: [PATCH 07/10] Remove repeated tests --- binderhub/tests/test_utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index 9c1c7ff7d..b0ef9b998 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -4,18 +4,6 @@ def test_rendezvous_rank(): # check that a key doesn't move if its assigned bucket remains but the # other buckets are removed - key = "k1" - first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) - - assert first_round[0] == second_round[0], key - - key = "sdsdggdddddd" - first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) - second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) - - assert first_round[0] == second_round[0], key - key = "crazy frog is a crazy key" first_round = utils.rendezvous_rank(["b1", "b2", "b3"], key) second_round = utils.rendezvous_rank([first_round[0], first_round[1]], key) From 83310fc5794a2690d070a73972973c4065c35b3b Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 24 Sep 2019 22:18:16 +0200 Subject: [PATCH 08/10] Add stability and redistribution tests --- binderhub/tests/test_utils.py | 47 +++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index b0ef9b998..e45659823 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -12,12 +12,45 @@ def test_rendezvous_rank(): def test_rendezvous_independence(): - # check that the relative ranking of two buckets doesn't depend on the - # presence of a third bucket + # check that the relative ranking of 80 buckets doesn't depend on the + # presence of 20 extra buckets key = "k1" - two_buckets = utils.rendezvous_rank(["b1", "b3"], 
key) - three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) - # remove "b2" and check ranking - three_buckets.remove("b2") + eighty_buckets = utils.rendezvous_rank(["b%i" % i for i in range(80)], key) + hundred_buckets = utils.rendezvous_rank(["b%i" % i for i in range(100)], key) - assert two_buckets == three_buckets + for i in range(80, 100): + hundred_buckets.remove("b%i" % i) + + assert eighty_buckets == hundred_buckets + + +def test_rendezvous_redistribution(): + # check that approximately a third of keys move to the new bucket + # when one is added + n_keys = 3000 + + # counnt how many keys were moved and which bucket were they moved from + n_moved = 0 + from_b1 = 0 + from_b2 = 0 + + for i in range(n_keys): + key = f"key-{i}" + two_buckets = utils.rendezvous_rank(["b1", "b2"], key) + three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) + + if two_buckets[0] != three_buckets[0]: + n_moved += 1 + if two_buckets[0] == "b1": + from_b1 += 1 + if two_buckets[0] == "b2": + from_b2 += 1 + # should always move to the newly added bucket + assert three_buckets[0] == "b3" + + # because of statistical fluctuations we have to leave some room when + # making this comparison + assert 0.31 < n_moved / n_keys < 0.35 + # keys should move from the two original buckets with approximately + # equal probability + assert abs(from_b1 - from_b2) < 10 From e1567d1701aa7aadd95d0495ba47e9f38dad3ad5 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Wed, 25 Sep 2019 08:22:31 +0200 Subject: [PATCH 09/10] Check that we start with roughly equal bucket assignments --- binderhub/tests/test_utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/binderhub/tests/test_utils.py b/binderhub/tests/test_utils.py index e45659823..108397e9b 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -29,22 +29,22 @@ def test_rendezvous_redistribution(): # when one is added n_keys = 3000 - # counnt how many keys were moved and which bucket were they moved from + # count how many keys were moved, which bucket a key started from and + # which bucket a key was moved from (to the new bucket) n_moved = 0 - from_b1 = 0 - from_b2 = 0 + from_bucket = {"b1": 0, "b2": 0} + start_in = {"b1": 0, "b2": 0} for i in range(n_keys): key = f"key-{i}" two_buckets = utils.rendezvous_rank(["b1", "b2"], key) + start_in[two_buckets[0]] += 1 three_buckets = utils.rendezvous_rank(["b1", "b2", "b3"], key) if two_buckets[0] != three_buckets[0]: n_moved += 1 - if two_buckets[0] == "b1": - from_b1 += 1 - if two_buckets[0] == "b2": - from_b2 += 1 + from_bucket[two_buckets[0]] += 1 + # should always move to the newly added bucket assert three_buckets[0] == "b3" @@ -53,4 +53,8 @@ def test_rendezvous_redistribution(): assert 0.31 < n_moved / n_keys < 0.35 # keys should move from the two original buckets with approximately # equal probability - assert abs(from_b1 - from_b2) < 10 + assert abs(from_bucket["b1"] - from_bucket["b2"]) < 10 + + # use the standard deviation of the expected number of entries in each + # bucket to get an idea for a reasonable (order of magnitude) difference + assert abs(start_in["b1"] - start_in["b2"]) < (n_keys/2)**0.5 From 34c9eeeb2113b13ba9905faf96e2e20805e56d55 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Wed, 25 Sep 2019 09:51:28 +0200 Subject: [PATCH 10/10] Extend rendezvous hash testing --- binderhub/tests/test_utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/binderhub/tests/test_utils.py 
b/binderhub/tests/test_utils.py index 108397e9b..fc681fba2 100644 --- a/binderhub/tests/test_utils.py +++ b/binderhub/tests/test_utils.py @@ -52,9 +52,8 @@ def test_rendezvous_redistribution(): # making this comparison assert 0.31 < n_moved / n_keys < 0.35 # keys should move from the two original buckets with approximately - # equal probability - assert abs(from_bucket["b1"] - from_bucket["b2"]) < 10 - - # use the standard deviation of the expected number of entries in each - # bucket to get an idea for a reasonable (order of magnitude) difference - assert abs(start_in["b1"] - start_in["b2"]) < (n_keys/2)**0.5 + # equal probability. We pick 30 because it is "about right" + assert abs(from_bucket["b1"] - from_bucket["b2"]) < 30 + # the initial distribution of keys should be roughly the same + # We pick 30 because it is "about right" + assert abs(start_in["b1"] - start_in["b2"]) < 30
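
For reference, here is a minimal, self-contained sketch of the rendezvous ("highest random weight") ranking that this series introduces. It mirrors the final shape of binderhub/utils.rendezvous_rank after patch 05, but it is an illustration rather than part of the PR; the node names and repository URL in the example run are made up.

# Standalone sketch of the rendezvous ranking used by the sticky-builds
# feature. Node names and the repository URL below are illustrative only.
from hashlib import blake2b


def blake2b_hash_as_int(b):
    """Hash the bytes `b` with Blake2 and return an unsigned 64-bit integer."""
    return int.from_bytes(blake2b(b, digest_size=8).digest(), "big")


def rendezvous_rank(buckets, key):
    """Return `buckets` sorted so the best match for `key` comes first.

    Every (key, bucket) pair gets an independent pseudo-random score; the
    bucket with the highest score wins. Removing a bucket only reassigns
    the keys that were mapped to it, which is what keeps builds sticky.
    """
    ranking = []
    for bucket in buckets:
        score = blake2b_hash_as_int(
            b"%s-%s" % (str(key).encode(), str(bucket).encode())
        )
        ranking.append((score, bucket))
    return [bucket for (score, bucket) in sorted(ranking, reverse=True)]


if __name__ == "__main__":
    nodes = ["node-a", "node-b", "node-c"]
    repo = "https://github.com/binder-examples/requirements"  # hypothetical key

    # The first entry is the preferred node for this repository's build pods.
    print(rendezvous_rank(nodes, repo))

    # If the preferred node disappears, only this repository's builds move;
    # keys that preferred one of the surviving nodes keep their assignment.
    survivors = [n for n in nodes if n != rendezvous_rank(nodes, repo)[0]]
    print(rendezvous_rank(survivors, repo))

This independence between buckets is what the tests in patches 08-10 exercise: adding a third node should move roughly a third of the keys, all of them onto the new node. Build.get_affinity then expresses the winning node only as a weighted preferredDuringSchedulingIgnoredDuringExecution node-affinity term, so a build still schedules elsewhere if that node is full or has gone away.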