From 48ec4123c0d60e49b6ef0d02b4ff3b6798f5e868 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 24 Nov 2020 09:14:00 -0800
Subject: [PATCH 1/3] nomad: try to avoid slice resizing when batching

---
 nomad/node_endpoint.go | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go
index f795d1bd2c3a..36a18a26f640 100644
--- a/nomad/node_endpoint.go
+++ b/nomad/node_endpoint.go
@@ -1144,8 +1144,13 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
 			updates := n.updates
 			evals := n.evals
 			future := n.updateFuture
-			n.updates = nil
-			n.evals = nil
+
+			// Assume future update patterns will be similar to
+			// current batch and set cap appropriately to avoid
+			// slice resizing.
+			n.updates = make([]*structs.Allocation, 0, len(updates))
+			n.evals = make([]*structs.Evaluation, 0, len(evals))
+
 			n.updateFuture = nil
 			n.updateTimer = nil
 			n.updatesLock.Unlock()

From e6fd2583faebf28754b18dfd49f7e213a1d2a593 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 24 Nov 2020 09:14:27 -0800
Subject: [PATCH 2/3] client: always wait 200ms before sending updates

Always wait 200ms before calling the Node.UpdateAlloc RPC to send
allocation updates to servers.

Prior to this change we only reset the update ticker when an error was
encountered. This meant the 200ms ticker was running while the RPC was
being performed. If the RPC was slow due to network latency or server
load and took >=200ms, the ticker would tick during the RPC.

Then on the next loop the select would randomly choose between the two
viable cases: receive an update or fire the RPC again.

If the RPC case won it would immediately loop again due to there being
no updates to send. When the update chan receive is selected, a single
update is added to the batch. The odds are then 50/50 that the
subsequent loop will send that single update instead of receiving any
more updates.

This could cause a couple of problems:

1. Since only a small number of updates are sent, the chan buffer may
   fill, applying backpressure, and slowing down other client
   operations.

2. The small number of updates sent may already be stale and not
   represent the current state of the allocation locally.

A risk here is that it's hard to reason about how this will interact
with the 50ms batches on servers when the servers are under load.

A further improvement would be to completely remove the alloc update
chan and instead use a mutex to build a map of alloc updates. I wanted
to test the lowest-risk change possible on loaded servers first before
making more drastic changes.
---
 client/client.go | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/client/client.go b/client/client.go
index 9a1e8e1d04e3..2674e02dd657 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1904,7 +1904,6 @@ func (c *Client) AllocStateUpdated(alloc *structs.Allocation) {
 // allocSync is a long lived function that batches allocation updates to the
 // server.
 func (c *Client) allocSync() {
-	staggered := false
 	syncTicker := time.NewTicker(allocSyncIntv)
 	updates := make(map[string]*structs.Allocation)
 	for {
@@ -1933,19 +1932,24 @@ func (c *Client) allocSync() {
 			}
 
 			var resp structs.GenericResponse
-			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
+			err := c.RPC("Node.UpdateAlloc", &args, &resp)
+			if err != nil {
+				// Error updating allocations, do *not* clear
+				// updates and retry after backoff
 				c.logger.Error("error updating allocations", "error", err)
 				syncTicker.Stop()
 				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
-				staggered = true
-			} else {
-				updates = make(map[string]*structs.Allocation)
-				if staggered {
-					syncTicker.Stop()
-					syncTicker = time.NewTicker(allocSyncIntv)
-					staggered = false
-				}
+				continue
 			}
+
+			// Successfully updated allocs, reset map and ticker.
+			// Always reset ticker to give loop time to receive
+			// alloc updates. If the RPC took the ticker interval
+			// we may call it in a tight loop before draining
+			// buffered updates.
+			updates = make(map[string]*structs.Allocation, len(updates))
+			syncTicker.Stop()
+			syncTicker = time.NewTicker(allocSyncIntv)
 		}
 	}
 }

From 26127c088ce5a4d1ad0de176b3db545bccc65bd4 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Mon, 30 Nov 2020 10:27:13 -0800
Subject: [PATCH 3/3] docs: add #9435 to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4160dc7a58a..2e9df3fc7265 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ IMPROVEMENTS:
 * client: Use ec2 CPU perf data from AWS API [[GH-7830](https://github.com/hashicorp/nomad/issues/7830)]
 * client: Added support for Azure fingerprinting. [[GH-8979](https://github.com/hashicorp/nomad/issues/8979)]
 * client: Batch state store writes to reduce disk IO. [[GH-9093](https://github.com/hashicorp/nomad/issues/9093)]
+* client: Reduce rate of sending allocation updates when servers are slow. [[GH-9435](https://github.com/hashicorp/nomad/issues/9435)]
 * client: Added support for fingerprinting the client node's Consul segment. [[GH-7214](https://github.com/hashicorp/nomad/issues/7214)]
 * client: Added `NOMAD_JOB_ID` and `NOMAD_PARENT_JOB_ID` environment variables to those made available to jobs. [[GH-8967](https://github.com/hashicorp/nomad/issues/8967)]
 * client: Updated consul-template to v0.25.0 - config `function_blacklist` deprecated and replaced with `function_denylist` [[GH-8988](https://github.com/hashicorp/nomad/pull/8988)]
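
For illustration only (not part of the patches above): the loop shape that patch 2 moves allocSync toward can be sketched as a small standalone Go program. This is a hedged approximation, not Nomad's code; names such as syncLoop, sendBatch, updateCh, syncInterval, and retryInterval are placeholders, and time.Ticker.Reset (Go 1.15+) stands in for the patch's Stop/NewTicker pair.

package main

import "time"

const (
	syncInterval  = 200 * time.Millisecond // placeholder for allocSyncIntv
	retryInterval = 5 * time.Second        // placeholder for the retry backoff
)

// syncLoop coalesces IDs from updateCh and flushes them via sendBatch.
// The ticker is reset after *every* flush attempt, so a flush that takes
// longer than the interval cannot leave an already-fired ticker that would
// immediately re-send a tiny batch on the next iteration.
func syncLoop(updateCh <-chan string, sendBatch func([]string) error, shutdownCh <-chan struct{}) {
	ticker := time.NewTicker(syncInterval)
	defer ticker.Stop()

	batch := make(map[string]struct{})

	for {
		select {
		case <-shutdownCh:
			return

		case id := <-updateCh:
			// Coalesce updates until the ticker fires.
			batch[id] = struct{}{}

		case <-ticker.C:
			if len(batch) == 0 {
				continue
			}

			ids := make([]string, 0, len(batch))
			for id := range batch {
				ids = append(ids, id)
			}

			if err := sendBatch(ids); err != nil {
				// Keep the batch so it is retried (and extended by
				// newly received updates) after the backoff interval.
				ticker.Reset(retryInterval)
				continue
			}

			// Success: clear the batch and restart the full interval,
			// giving the loop time to drain buffered updates before
			// the next flush.
			batch = make(map[string]struct{}, len(batch))
			ticker.Reset(syncInterval)
		}
	}
}

func main() {
	updateCh := make(chan string, 16)
	shutdownCh := make(chan struct{})

	go syncLoop(updateCh, func(ids []string) error {
		// Stand-in for the Node.UpdateAlloc RPC.
		return nil
	}, shutdownCh)

	updateCh <- "alloc-1"
	updateCh <- "alloc-2"
	time.Sleep(300 * time.Millisecond)
	close(shutdownCh)
}

The patch itself achieves the same effect with syncTicker.Stop() followed by time.NewTicker, which works on older Go versions; Reset is used here only to keep the sketch short.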