diff --git a/incubator/hnc/pkg/testutils/testutils.go b/incubator/hnc/pkg/testutils/testutils.go index af33ee475..1be8c4e43 100644 --- a/incubator/hnc/pkg/testutils/testutils.go +++ b/incubator/hnc/pkg/testutils/testutils.go @@ -203,7 +203,8 @@ func RunCommand(cmdln ...string) (string, error) { args = append(args, strings.Split(subcmdln, " ")...) } } - GinkgoT().Log("Running: ", args) + prefix := fmt.Sprintf("[%d] Running: ", time.Now().Unix()) + GinkgoT().Log(prefix, args) cmd := exec.Command(args[0], args[1:]...) stdout, err := cmd.CombinedOutput() return string(stdout), err @@ -247,6 +248,11 @@ func CheckHNCPath() { } func RecoverHNC() { + // HNC can take a long time (>30s) to recover in some cases if various parts of its deployment are + // deleted, such as the validating webhook configuration or the CRDs. It appears that deleting the + // deployment before reapplying the manifests seems to allow HNC to start operating again much + // faster. + TryRun("kubectl delete deployment --all -n hnc-system") err := TryRun("kubectl apply -f", hncRecoverPath) if err != nil { GinkgoT().Log("-----------------------------WARNING------------------------------") @@ -257,13 +263,34 @@ func RecoverHNC() { } // give HNC enough time to repair time.Sleep(5 * time.Second) - // Verify and wait till HNC is fully repaired, sometimes it takes up to 30s. - // The `kubectl hns create` command will fail is HNC is broken, so we confirm that HNC is back by successfully - // running this command. - CleanupNamespaces("a", "b") - MustRun("kubectl create ns a") - RunShouldContain("Successfully created", 30, "kubectl hns create b -n a") - CleanupNamespaces("a", "b") + // Verify and wait till HNC is fully repaired, sometimes it takes up to 30s. We try to create a + // subnamespace and wait for it to be created to show that both the validators and reconcilers are + // up and running. 
+ const ( + a = "recover-test-a" + b = "recover-test-b" + ) + // Do NOT use CleanupNamespaces because that just assumes that if it can't delete a namespace that + // everything's fine, but this is a poor assumption if HNC has just been repaired. + // + // TODO: if CleanupNamespaces ever starts using labels to select namespaces to delete, then get + // rid of this hack. + if err := TryRunQuietly("kubectl get ns", a); err == nil { + MustRunWithTimeout(30, "kubectl hns set", a, "-a") + MustRunWithTimeout(30, "kubectl delete ns", a) + } + if err := TryRunQuietly("kubectl get ns", b); err == nil { + MustRunWithTimeout(30, "kubectl annotate ns", b, "hnc.x-k8s.io/subnamespaceOf-") + MustRunWithTimeout(30, "kubectl delete ns", b) + } + // Ensure validators work + MustRunWithTimeout(30, "kubectl create ns", a) + // Ensure reconcilers work + MustRunWithTimeout(30, "kubectl hns create", b, "-n", a) + MustRunWithTimeout(30, "kubectl get ns", b) + // At this point we can assume that HNC is working sufficiently for the regular CleanupNamespaces + // to work. + CleanupNamespaces(a, b) } func WriteTempFile(cxt string) string { diff --git a/incubator/hnc/test/e2e/rolebinding_test.go b/incubator/hnc/test/e2e/rolebinding_test.go index 4ad9a39b8..b868bc6c2 100644 --- a/incubator/hnc/test/e2e/rolebinding_test.go +++ b/incubator/hnc/test/e2e/rolebinding_test.go @@ -1,6 +1,8 @@ package e2e import ( + "time" + . "github.com/onsi/ginkgo" . "sigs.k8s.io/multi-tenancy/incubator/hnc/pkg/testutils" ) @@ -31,14 +33,22 @@ var _ = Describe("HNC should delete and create a new Rolebinding instead of upda MustRun("kubectl hns create", nsChild, "-n", nsParent) MustRun("kubectl create rolebinding test --clusterrole=admin --serviceaccount=default:default -n", nsParent) FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "admin") + + // It takes a while for the pods to actually be deleted - over 60s, in some cases (especially on + // Kind, I think). 
But we don't actually need to wait for the pods to be fully deleted - waiting + // a few moments seems to be fine, and then the terminated pods don't get in the way. I picked + // 5s fairly arbitrarily, but it works well. Feel free to try lower values if you like. + // - aludwin, Sep 2020 MustRun("kubectl delete deployment --all -n hnc-system") - // The pod might take up to a minite to be deleted, we force the deletion here to save time - MustRun("kubectl delete pods --all -n hnc-system --grace-period=0 --force") - RunShouldContain("No resources found", 60, "kubectl get pods -n hnc-system") + time.Sleep(5*time.Second) + + // Replace the source rolebinding MustRun("kubectl delete rolebinding test -n", nsParent) MustNotRun("kubectl describe rolebinding test -n", nsParent) MustRun("kubectl create rolebinding test --clusterrole=edit --serviceaccount=default:default -n", nsParent) FieldShouldContain("rolebinding", nsParent, "test", ".roleRef.name", "edit") + + // Restore HNC and verify that the new RB is propagated RecoverHNC() FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "edit") })