Skip to content
This repository has been archived by the owner on Jun 26, 2023. It is now read-only.

Make e2e test repairs more robust #1164

Merged
merged 1 commit into from
Oct 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 35 additions & 8 deletions incubator/hnc/pkg/testutils/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,8 @@ func RunCommand(cmdln ...string) (string, error) {
args = append(args, strings.Split(subcmdln, " ")...)
}
}
GinkgoT().Log("Running: ", args)
prefix := fmt.Sprintf("[%d] Running: ", time.Now().Unix())
GinkgoT().Log(prefix, args)
cmd := exec.Command(args[0], args[1:]...)
stdout, err := cmd.CombinedOutput()
return string(stdout), err
Expand Down Expand Up @@ -247,6 +248,11 @@ func CheckHNCPath() {
}

func RecoverHNC() {
// HNC can take a long time (>30s) to recover in some cases if various parts of its deployment are
deleted, such as the validating webhook configuration or the CRDs. Deleting the
deployment before reapplying the manifests seems to allow HNC to start operating again much
faster.
TryRun("kubectl delete deployment --all -n hnc-system")
err := TryRun("kubectl apply -f", hncRecoverPath)
if err != nil {
GinkgoT().Log("-----------------------------WARNING------------------------------")
Expand All @@ -257,13 +263,34 @@ func RecoverHNC() {
}
// give HNC enough time to repair
time.Sleep(5 * time.Second)
// Verify and wait till HNC is fully repaired, sometimes it takes up to 30s.
// The `kubectl hns create` command will fail if HNC is broken, so we confirm that HNC is back by
// successfully running this command.
CleanupNamespaces("a", "b")
MustRun("kubectl create ns a")
RunShouldContain("Successfully created", 30, "kubectl hns create b -n a")
CleanupNamespaces("a", "b")
// Verify and wait till HNC is fully repaired, sometimes it takes up to 30s. We try to create a
// subnamespace and wait for it to be created to show that both the validators and reconcilers are
// up and running.
const (
a = "recover-test-a"
b = "recover-test-b"
)
// Do NOT use CleanupNamespaces because that just assumes that if it can't delete a namespace that
// everything's fine, but this is a poor assumption if HNC has just been repaired.
//
// TODO: if CleanupNamespaces ever starts using labels to select namespaces to delete, then get
// rid of this hack.
if err := TryRunQuietly("kubectl get ns", a); err == nil {
MustRunWithTimeout(30, "kubectl hns set", a, "-a")
MustRunWithTimeout(30, "kubectl delete ns", a)
}
if err := TryRunQuietly("kubectl get ns", b); err == nil {
MustRunWithTimeout(30, "kubectl annotate ns", b, "hnc.x-k8s.io/subnamespaceOf-")
MustRunWithTimeout(30, "kubectl delete ns", b)
}
Comment on lines +273 to +285
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this looked the same to me as CleanupNamespaces() except skipping de-annotating a and no setting AC for b. What makes a difference?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CleanupNamespaces is TryRunQuietly, this is MustRunWithTimeout.

// Ensure validators work
MustRunWithTimeout(30, "kubectl create ns", a)
// Ensure reconcilers work
MustRunWithTimeout(30, "kubectl hns create", b, "-n", a)
MustRunWithTimeout(30, "kubectl get ns", b)
// At this point we can assume that HNC is working sufficiently for the regular CleanupNamespaces
// to work.
CleanupNamespaces(a, b)
}

func WriteTempFile(cxt string) string {
Expand Down
16 changes: 13 additions & 3 deletions incubator/hnc/test/e2e/rolebinding_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package e2e

import (
"time"

. "github.com/onsi/ginkgo"
. "sigs.k8s.io/multi-tenancy/incubator/hnc/pkg/testutils"
)
Expand Down Expand Up @@ -31,14 +33,22 @@ var _ = Describe("HNC should delete and create a new Rolebinding instead of upda
MustRun("kubectl hns create", nsChild, "-n", nsParent)
MustRun("kubectl create rolebinding test --clusterrole=admin --serviceaccount=default:default -n", nsParent)
FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "admin")

// It takes a while for the pods to actually be deleted - over 60s, in some cases (especially on
// Kind, I think). But we don't actually need to wait for the pods to be fully deleted - waiting
// a few moments seems to be fine, and then the terminated pods don't get in the way. I picked
// 5s fairly arbitrarily, but it works well. Feel free to try lower values if you like.
// - aludwin, Sep 2020
MustRun("kubectl delete deployment --all -n hnc-system")
// The pods might take up to a minute to be deleted, so we force the deletion here to save time.
MustRun("kubectl delete pods --all -n hnc-system --grace-period=0 --force")
RunShouldContain("No resources found", 60, "kubectl get pods -n hnc-system")
time.Sleep(5*time.Second)

// Replace the source rolebinding
MustRun("kubectl delete rolebinding test -n", nsParent)
MustNotRun("kubectl describe rolebinding test -n", nsParent)
MustRun("kubectl create rolebinding test --clusterrole=edit --serviceaccount=default:default -n", nsParent)
FieldShouldContain("rolebinding", nsParent, "test", ".roleRef.name", "edit")

// Restore HNC and verify that the new RB is propagated
RecoverHNC()
FieldShouldContain("rolebinding", nsChild, "test", ".roleRef.name", "edit")
})
Expand Down