From 6665340e7956fccc73e98212447c6f91301174ec Mon Sep 17 00:00:00 2001
From: Adrian Ludwin
Date: Wed, 30 Sep 2020 01:39:10 -0400
Subject: [PATCH] Make e2e test repairs more robust

Made several debuggability and functional improvements:

* Added timestamps to all output to correlate it with logs. Timestamps
  are of the form "seconds since Unix epoch", which isn't human-friendly
  but is identical to the timestamps produced by the HNC logs.
* Fully deleted the HNC deployment in RecoverHNC, since HNC doesn't
  really seem to recover well if various things are changed without
  restarting the pod.
* Made the post-recovery test far more robust by no longer ignoring
  failures to delete the test namespaces.
* Stopped force-deleting pods in the rolebinding test; instead, the test
  now just waits a few moments. I found that we don't actually need to
  wait for the pod object to be fully deleted on the server for it to
  stop getting in the way; the container in the pod appears to stop
  running ~instantly, while the pod can occasionally hang around for
  over 60s.

All of these changes were added as I saw failures in the affected code.

Tested: Ran a set of five flaky tests with and without these changes
(while also including PR #1163). Without them, at least one of the tests
failed virtually every time; with this change, they passed 5/5 times on
GKE plus 2/2 times on Kind.
---
 incubator/hnc/pkg/testutils/testutils.go   | 43 ++++++++++++++++++----
 incubator/hnc/test/e2e/rolebinding_test.go | 16 ++++++--
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/incubator/hnc/pkg/testutils/testutils.go b/incubator/hnc/pkg/testutils/testutils.go
index af33ee475..1be8c4e43 100644
--- a/incubator/hnc/pkg/testutils/testutils.go
+++ b/incubator/hnc/pkg/testutils/testutils.go
@@ -203,7 +203,8 @@ func RunCommand(cmdln ...string) (string, error) {
 			args = append(args, strings.Split(subcmdln, " ")...)
 		}
 	}
-	GinkgoT().Log("Running: ", args)
+	prefix := fmt.Sprintf("[%d] Running: ", time.Now().Unix())
+	GinkgoT().Log(prefix, args)
 	cmd := exec.Command(args[0], args[1:]...)
 	stdout, err := cmd.CombinedOutput()
 	return string(stdout), err
@@ -247,6 +248,11 @@ func CheckHNCPath() {
 }
 
 func RecoverHNC() {
+	// HNC can take a long time (>30s) to recover in some cases if various parts of its deployment are
+	// deleted, such as the validating webhook configuration or the CRDs. Deleting the deployment
+	// before reapplying the manifests seems to allow HNC to start operating again much
+	// faster.
+	TryRun("kubectl delete deployment --all -n hnc-system")
 	err := TryRun("kubectl apply -f", hncRecoverPath)
 	if err != nil {
 		GinkgoT().Log("-----------------------------WARNING------------------------------")
@@ -257,13 +263,34 @@
 	}
 	// give HNC enough time to repair
 	time.Sleep(5 * time.Second)
-	// Verify and wait till HNC is fully repaired, sometimes it takes up to 30s.
-	// The `kubectl hns create` command will fail is HNC is broken, so we confirm that HNC is back by successfully
-	// running this command.
-	CleanupNamespaces("a", "b")
-	MustRun("kubectl create ns a")
-	RunShouldContain("Successfully created", 30, "kubectl hns create b -n a")
-	CleanupNamespaces("a", "b")
+	// Verify and wait until HNC is fully repaired; sometimes this takes up to 30s. We try to create a
+	// subnamespace and wait for it to be created to show that both the validators and reconcilers are
+	// up and running.
+	const (
+		a = "recover-test-a"
+		b = "recover-test-b"
+	)
+	// Do NOT use CleanupNamespaces because it just assumes that if it can't delete a namespace,
+	// everything's fine, but this is a poor assumption if HNC has just been repaired.
+	//
+	// TODO: if CleanupNamespaces ever starts using labels to select namespaces to delete, then get
+	// rid of this hack.
+	if err := TryRunQuietly("kubectl get ns", a); err == nil {
+		MustRunWithTimeout(30, "kubectl hns set", a, "-a")
+		MustRunWithTimeout(30, "kubectl delete ns", a)
+	}
+	if err := TryRunQuietly("kubectl get ns", b); err == nil {
+		MustRunWithTimeout(30, "kubectl annotate ns", b, "hnc.x-k8s.io/subnamespaceOf-")
+		MustRunWithTimeout(30, "kubectl delete ns", b)
+	}
+	// Ensure validators work
+	MustRunWithTimeout(30, "kubectl create ns", a)
+	// Ensure reconcilers work
+	MustRunWithTimeout(30, "kubectl hns create", b, "-n", a)
+	MustRunWithTimeout(30, "kubectl get ns", b)
+	// At this point we can assume that HNC is working sufficiently for the regular CleanupNamespaces
+	// to work.
+	CleanupNamespaces(a, b)
 }
 
 func WriteTempFile(cxt string) string {
diff --git a/incubator/hnc/test/e2e/rolebinding_test.go b/incubator/hnc/test/e2e/rolebinding_test.go
index 4ad9a39b8..b868bc6c2 100644
--- a/incubator/hnc/test/e2e/rolebinding_test.go
+++ b/incubator/hnc/test/e2e/rolebinding_test.go
@@ -1,6 +1,8 @@
 package e2e
 
 import (
+	"time"
+
 	. "github.com/onsi/ginkgo"
 	. "sigs.k8s.io/multi-tenancy/incubator/hnc/pkg/testutils"
 )
@@ -31,14 +33,22 @@ var _ = Describe("HNC should delete and create a new Rolebinding instead of upda
 		MustRun("kubectl hns create", nsChild, "-n", nsParent)
 		MustRun("kubectl create rolebinding test --clusterrole=admin --serviceaccount=default:default -n", nsParent)
 		FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "admin")
+
+		// It takes a while for the pods to actually be deleted - over 60s, in some cases (especially on
+		// Kind, I think). But we don't actually need to wait for the pods to be fully deleted - waiting
+		// a few moments seems to be fine, and then the terminated pods don't get in the way. I picked
+		// 5s fairly arbitrarily, but it works well. Feel free to try lower values if you like.
+		// - aludwin, Sep 2020
 		MustRun("kubectl delete deployment --all -n hnc-system")
-		// The pod might take up to a minite to be deleted, we force the deletion here to save time
-		MustRun("kubectl delete pods --all -n hnc-system --grace-period=0 --force")
-		RunShouldContain("No resources found", 60, "kubectl get pods -n hnc-system")
+		time.Sleep(5 * time.Second)
+
+		// Replace the source rolebinding
 		MustRun("kubectl delete rolebinding test -n", nsParent)
 		MustNotRun("kubectl describe rolebinding test -n", nsParent)
 		MustRun("kubectl create rolebinding test --clusterrole=edit --serviceaccount=default:default -n", nsParent)
 		FieldShouldContain("rolebinding", nsParent, "test", ".roleRef.name", "edit")
+
+		// Restore HNC and verify that the new RB is propagated
 		RecoverHNC()
 		FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "edit")
 	})
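For reference, here is a minimal standalone sketch of the timestamp prefix that RunCommand now emits.
It is only an illustration of the format and not part of the patch: GinkgoT().Log is replaced with
fmt.Println so the snippet runs outside a Ginkgo suite, and the sample args slice is made up.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Seconds since the Unix epoch - the same format HNC uses in its own logs,
	// so test output and controller logs can be correlated by timestamp.
	prefix := fmt.Sprintf("[%d] Running: ", time.Now().Unix())
	args := []string{"kubectl", "get", "ns"}
	fmt.Println(prefix, args)
	// Prints something like: [1601445550] Running:  [kubectl get ns]
}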