Skip to content

Commit

Permalink
Resolve review comments
Browse files Browse the repository at this point in the history
Notable change
1. When both exact match key and wildcard key matches a node taint key, the wildcard key
will have no effect on the node taint key. Added test to verify that
2. Refactor replace_failed_processgroups_test.go
3. Change clusterrolebinding to rolebinding
4. Coding style and various cosmetic improvement
  • Loading branch information
xumengpanda committed May 6, 2023
1 parent 625b6e9 commit 9f4ddc3
Show file tree
Hide file tree
Showing 12 changed files with 263 additions and 302 deletions.
2 changes: 1 addition & 1 deletion api/v1beta2/foundationdbcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1019,8 +1019,8 @@ type TaintReplacementOption struct {
Key *string `json:"key,omitempty"`

// The tainted key must be present for DurationInSeconds before operator replaces pods on the node with this taint.
// Negative DurationInSeconds disables operator from detecting or replacing the taint Key
// +kubebuilder:validation:Required
// +kubebuilder:validation:Minimum=0
DurationInSeconds *int64 `json:"durationInSeconds,omitempty"`
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9930,6 +9930,7 @@ spec:
properties:
durationInSeconds:
format: int64
minimum: 0
type: integer
key:
maxLength: 256
Expand Down
12 changes: 0 additions & 12 deletions config/deployment/rbac_clusterrole_binding.yaml

This file was deleted.

14 changes: 14 additions & 0 deletions config/deployment/rbac_role_binding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,17 @@ roleRef:
subjects:
- kind: ServiceAccount
name: fdb-kubernetes-operator-controller-manager
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
creationTimestamp: null
name: manager-clusterrolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: fdb-kubernetes-operator-manager-clusterrole
subjects:
- kind: ServiceAccount
name: fdb-kubernetes-operator-controller-manager

13 changes: 13 additions & 0 deletions config/samples/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ rules:
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
creationTimestamp: null
name: fdb-kubernetes-operator-manager-clusterrolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: fdb-kubernetes-operator-manager-clusterrole
subjects:
- kind: ServiceAccount
name: fdb-kubernetes-operator-controller-manager
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
creationTimestamp: null
name: fdb-kubernetes-operator-manager-rolebinding
Expand Down
362 changes: 164 additions & 198 deletions controllers/replace_failed_process_groups_test.go

Large diffs are not rendered by default.

55 changes: 33 additions & 22 deletions controllers/update_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,24 +655,23 @@ func validateProcessGroup(ctx context.Context, r *FoundationDBClusterReconciler,
// updateTaintCondition checks pod's node taint label and update pod's taint-related condition accordingly
func updateTaintCondition(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster,
pod *corev1.Pod, processGroupStatus *fdbv1beta2.ProcessGroupStatus, nodeMap map[string]*corev1.Node, logger logr.Logger) error {
//node := corev1.Node{}
node, ok := nodeMap[pod.Spec.NodeName]
log.Info("Get pod's node Start", "Pod", pod.Name, "Pod's node name", pod.Spec.NodeName, "node map size", len(nodeMap))
log.V(4).Info("Get pod's node", "Pod", pod.Name, "Pod's node name", pod.Spec.NodeName, "node map size", len(nodeMap))
if !ok {
node = &corev1.Node{}
err := r.Get(ctx, client.ObjectKey{Name: pod.Spec.NodeName}, node)
if err != nil {
log.Info("Get pod's node fails", "Pod", pod.Name, "Pod's node name", pod.Spec.NodeName, "err", err)
return err
return fmt.Errorf("get pod %s node %s fails with error :%w", pod.Name, pod.Spec.NodeName, err)
}
nodeMap[pod.Spec.NodeName] = node
log.Info("Set nodeMap cache", "Pod", pod.Name, "Pod's node name", pod.Spec.NodeName, "Node", node)
log.V(4).Info("Set nodeMap cache", "Pod", pod.Name, "Pod's node name", pod.Spec.NodeName, "Node", node)
}

// Check the tainted duration and only mark the process group tainted after the configured tainted duration
// TODO: https://github.com/FoundationDB/fdb-kubernetes-operator/issues/1583
hasMatchingTaint := false
for _, taint := range node.Spec.Taints {
hasExactMatch := hasExactMatchedTaintKey(cluster.Spec.AutomationOptions.Replacements.TaintReplacementOptions, taint.Key)
for _, taintConfiguredKey := range cluster.Spec.AutomationOptions.Replacements.TaintReplacementOptions {
taintConfiguredKeyString := pointer.StringDeref(taintConfiguredKey.Key, "")
if taintConfiguredKeyString == "" || pointer.Int64Deref(taintConfiguredKey.DurationInSeconds, math.MinInt64) < 0 {
Expand All @@ -681,28 +680,31 @@ func updateTaintCondition(ctx context.Context, r *FoundationDBClusterReconciler,
"Pod", pod.Name)
continue
}
if taint.Key == "" || taint.TimeAdded.IsZero() {
if taint.Key == "" {
// Node's taint is not properly set, skip the node's taint
logger.Info("Taint is not properly set", "Node", node.Name, "Taint", taint)
continue
}
if (hasExactMatch && taint.Key != taintConfiguredKeyString) ||
(!hasExactMatch && taintConfiguredKeyString != "*") {
continue
}

if taint.Key == taintConfiguredKeyString || taintConfiguredKeyString == "*" {
hasMatchingTaint = true
processGroupStatus.UpdateCondition(fdbv1beta2.NodeTaintDetected, true, cluster.Status.ProcessGroups, processGroupStatus.ProcessGroupID)
// Use node taint's timestamp as the NodeTaintDetected condition's starting time
if taint.TimeAdded.Time.Unix() < *processGroupStatus.GetConditionTime(fdbv1beta2.NodeTaintDetected) {
processGroupStatus.UpdateConditionTime(fdbv1beta2.NodeTaintDetected, taint.TimeAdded.Unix())
}
taintDetectedTime := pointer.Int64Deref(processGroupStatus.GetConditionTime(fdbv1beta2.NodeTaintDetected), math.MaxInt64)
if time.Now().Unix()-taintDetectedTime > pointer.Int64Deref(taintConfiguredKey.DurationInSeconds, math.MaxInt64) {
processGroupStatus.UpdateCondition(fdbv1beta2.NodeTaintReplacing, true, cluster.Status.ProcessGroups, processGroupStatus.ProcessGroupID)
logger.Info("Add NodeTaintReplacing condition", "Pod", pod.Name, "Node", node.Name,
"TaintKey", taint.Key, "TaintDetectedTime", taintDetectedTime,
"TaintDuration", int64(time.Since(time.Unix(taintDetectedTime, 0))),
"TaintValue", taint.Value, "TaintEffect", taint.Effect,
"ClusterTaintDetectionDuration", time.Duration(pointer.Int64Deref(taintConfiguredKey.DurationInSeconds, math.MaxInt64)))
}
hasMatchingTaint = true
processGroupStatus.UpdateCondition(fdbv1beta2.NodeTaintDetected, true, cluster.Status.ProcessGroups, processGroupStatus.ProcessGroupID)
// Use node taint's timestamp as the NodeTaintDetected condition's starting time
if !taint.TimeAdded.IsZero() && taint.TimeAdded.Time.Unix() < *processGroupStatus.GetConditionTime(fdbv1beta2.NodeTaintDetected) {
processGroupStatus.UpdateConditionTime(fdbv1beta2.NodeTaintDetected, taint.TimeAdded.Unix())
}
taintDetectedTime := pointer.Int64Deref(processGroupStatus.GetConditionTime(fdbv1beta2.NodeTaintDetected), math.MaxInt64)
// If a wildcard is specified as key, it will be used as default value, except there is an exact match defined for the taint key.
if time.Now().Unix()-taintDetectedTime > pointer.Int64Deref(taintConfiguredKey.DurationInSeconds, math.MaxInt64) {
processGroupStatus.UpdateCondition(fdbv1beta2.NodeTaintReplacing, true, cluster.Status.ProcessGroups, processGroupStatus.ProcessGroupID)
logger.Info("Add NodeTaintReplacing condition", "Pod", pod.Name, "Node", node.Name,
"TaintKey", taint.Key, "TaintDetectedTime", taintDetectedTime,
"TaintDuration", int64(time.Since(time.Unix(taintDetectedTime, 0))),
"TaintValue", taint.Value, "TaintEffect", taint.Effect,
"ClusterTaintDetectionDuration", time.Duration(pointer.Int64Deref(taintConfiguredKey.DurationInSeconds, math.MaxInt64)))
}
}
}
Expand Down Expand Up @@ -839,3 +841,12 @@ func getRunningVersion(versionMap map[string]int, fallback string) (string, erro

return currentCandidate.String(), nil
}

func hasExactMatchedTaintKey(taintReplacementOptions []fdbv1beta2.TaintReplacementOption, nodeTaintKey string) bool {
for _, configuredTaintKey := range taintReplacementOptions {
if *configuredTaintKey.Key == nodeTaintKey {
return true
}
}
return false
}
3 changes: 1 addition & 2 deletions controllers/update_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ var _ = Describe("update_status", func() {

It("should disable taint feature", func() {
Expect(len(processGroupStatus.ProcessGroupConditions)).To(Equal(0))
Expect(err).NotTo(HaveOccurred())
})
})

Expand Down Expand Up @@ -455,7 +454,7 @@ var _ = Describe("update_status", func() {
})

It("should get a condition assigned", func() {
processGroupStatus, err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPods, allPvcs)
processGroupStatus, err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPods, allPvcs, logger)
Expect(err).NotTo(HaveOccurred())

incorrectProcesses := fdbv1beta2.FilterByCondition(processGroupStatus, fdbv1beta2.IncorrectCommandLine, false)
Expand Down
2 changes: 1 addition & 1 deletion docs/cluster_spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ TaintReplacementOption defines the taint key and taint duration the operator wil
| Field | Description | Scheme | Required |
| ----- | ----------- | ------ | -------- |
| key | Tainted key | *string | false |
| durationInSeconds | The tainted key must be present for DurationInSeconds before operator replaces pods on the node with this taint. Negative DurationInSeconds disables operator from detecting or replacing the taint Key | *int64 | false |
| durationInSeconds | The tainted key must be present for DurationInSeconds before operator replaces pods on the node with this taint. | *int64 | false |

[Back to TOC](#table-of-contents)

Expand Down
2 changes: 1 addition & 1 deletion e2e/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ TIMEOUT?=168h
NAMESPACE?=
CONTEXT?=
FDB_VERSION?=7.1.31
PREVIOUS_FDB_VERSION?=6.3.25
PREVIOUS_FDB_VERSION?=6.3.24
## Expectation is that you are running standard build image which generates both regular and debug (Symbols) images.
FDB_IMAGE?=foundationdb/foundationdb:$(FDB_VERSION)
SIDECAR_IMAGE?=foundationdb/foundationdb-kubernetes-sidecar:$(FDB_VERSION)-1
Expand Down
16 changes: 4 additions & 12 deletions e2e/fixtures/fdb_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,8 @@ func (fdbCluster *FdbCluster) UpdateClusterSpecWithSpec(desiredSpec *fdbv1beta2.

specUpdated := equality.Semantic.DeepEqual(fetchedCluster.Spec, *desiredSpec)
log.Println("UpdateClusterSpec: specUpdated:", specUpdated)
fmt.Printf("FetchedSpec %+v\n", fetchedCluster.Spec)
fmt.Printf("DesiredSpec %+v\n", desiredSpec)
if specUpdated {
return true
}
Expand All @@ -405,7 +407,7 @@ func (fdbCluster *FdbCluster) UpdateClusterSpecWithSpec(desiredSpec *fdbv1beta2.
}
// Retry here and let the method fetch the latest version of the cluster again until the spec is updated.
return false
}).WithTimeout(5 * time.Minute).WithPolling(1 * time.Second).Should(gomega.BeTrue())
}).WithTimeout(10 * time.Minute).WithPolling(1 * time.Second).Should(gomega.BeTrue())

fdbCluster.cluster = fetchedCluster
}
Expand Down Expand Up @@ -1044,17 +1046,7 @@ func (fdbCluster *FdbCluster) CheckPodIsDeleted(podName string) bool {
// It times out after timeoutMinutes.
func (fdbCluster *FdbCluster) EnsurePodIsDeletedWithCustomTimeout(podName string, timeoutMinutes int) {
gomega.Eventually(func() bool {
pod := &corev1.Pod{}
err := fdbCluster.getClient().
Get(ctx.TODO(), client.ObjectKey{Namespace: fdbCluster.Namespace(), Name: podName}, pod)

log.Println("error: ", err, "pod", pod.ObjectMeta)
if err != nil {
return kubeErrors.IsNotFound(err)
}

// For our case it's enough to validate that the Pod is marked for deletion.
return !pod.DeletionTimestamp.IsZero()
return fdbCluster.CheckPodIsDeleted(podName)
}).WithTimeout(time.Duration(timeoutMinutes) * time.Minute).WithPolling(1 * time.Second).Should(gomega.BeTrue())
}

Expand Down
Loading

0 comments on commit 9f4ddc3

Please sign in to comment.