From 1ca137a228433f3deaa2947be34dcb6d239c38db Mon Sep 17 00:00:00 2001
From: Alejandro Pedraza
Date: Tue, 5 Dec 2023 10:06:57 -0500
Subject: [PATCH] Add reinitialize-pods controller to linkerd-cni DaemonSet

Follow-up to linkerd/linkerd2-proxy-init#306
Fixes linkerd/linkerd2#11073

This adds the `reinitialize-pods` container to the `linkerd-cni`
DaemonSet, along with its configuration in `values.yaml`.

The `linkerd-cni` image version is also bumped so that it contains the
new binary for this controller.

## TO-DOs

- Integration test
---
 charts/linkerd2-cni/templates/cni-plugin.yaml |  51 +++++++++++++++++++
 charts/linkerd2-cni/values.yaml               |  40 ++++++++++++++-
 .../install-cni-plugin_default.golden         |  49 ++++++++++++++++++
 ...install-cni-plugin_fully_configured.golden |  47 +++++++++++++++++
 ...-plugin_fully_configured_equal_dsts.golden |  47 +++++++++++++++++
 ...lugin_fully_configured_no_namespace.golden |  47 +++++++++++++++++
 .../install-cni-plugin_skip_ports.golden      |  47 +++++++++++++++++
 .../install_cni_helm_default_output.golden    |  49 +++++++++++++++++-
 .../install_cni_helm_override_output.golden   |  47 +++++++++++++++++
 pkg/charts/cni/values.go                      |  10 ++++
 10 files changed, 431 insertions(+), 3 deletions(-)

diff --git a/charts/linkerd2-cni/templates/cni-plugin.yaml b/charts/linkerd2-cni/templates/cni-plugin.yaml
index 160449ee36620..62e0833d30763 100644
--- a/charts/linkerd2-cni/templates/cni-plugin.yaml
+++ b/charts/linkerd2-cni/templates/cni-plugin.yaml
@@ -112,6 +112,9 @@ rules:
 - apiGroups: [""]
   resources: ["pods", "nodes", "namespaces", "services"]
   verbs: ["list", "get", "watch"]
+- apiGroups: [""]
+  resources: ["pods/eviction"]
+  verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
@@ -274,6 +277,54 @@ spec:
         {{- if .Values.resources }}
         {{- include "partials.resources" .Values.resources | nindent 8 }}
         {{- end }}
+      # This container watches over pods whose linkerd-network-validator
+      # container failed, probably because of a race condition while setting up
+      # the CNI plugin chain, and evicts those pods so they can try acquiring a
+      # proper network config again
+      - name: reinitialize-pods
+        image: {{ .Values.reinitializePods.image.name -}}:{{- .Values.reinitializePods.image.version }}
+        imagePullPolicy: {{ .Values.reinitializePods.image.pullPolicy }}
+        {{- if .Values.reinitializePods.enableSecurityContext }}
+        env:
+        - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        command:
+        - /usr/lib/linkerd/linkerd-reinitialize-pods
+        args:
+        - --admin-addr=0.0.0.0:9990
+        - --log-format
+        - {{ .Values.reinitializePods.logFormat }}
+        - --log-level
+        - {{ .Values.reinitializePods.logLevel }}
+        livenessProbe:
+          httpGet:
+            path: /live
+            port: admin-http
+        readinessProbe:
+          failureThreshold: 7
+          httpGet:
+            path: /ready
+            port: admin-http
+          initialDelaySeconds: 10
+        ports:
+        - containerPort: 9990
+          name: admin-http
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          privileged: false
+          readOnlyRootFilesystem: true
+          seccompProfile:
+            type: RuntimeDefault
+        {{- end }}
+        {{- if .Values.resources }}
+        {{- include "partials.resources" .Values.resources | nindent 8 }}
+        {{- end }}
+      priorityClassName: system-cluster-critical
       volumes:
       {{- if ne .Values.destCNIBinDir .Values.destCNINetDir }}
       - name: cni-bin-dir
diff --git a/charts/linkerd2-cni/values.yaml b/charts/linkerd2-cni/values.yaml
index 977763edbb9a1..87340938945b3 100644
--- a/charts/linkerd2-cni/values.yaml
+++ b/charts/linkerd2-cni/values.yaml
@@ -53,7 +53,7 @@ image:
   # -- Docker image for the CNI plugin
   name: "cr.l5d.io/linkerd/cni-plugin"
   # -- Tag for the CNI container Docker image
-  version: "v1.2.2"
+  version: "v1.3.0"
   # -- Pull policy for the linkerd-cni container
   pullPolicy: IfNotPresent
 
@@ -86,7 +86,7 @@ extraInitContainers: []
 #   - mountPath: /host/etc/cni/net.d
 #     name: cni-net-dir
 
-# -- Resource requests and limits for linkerd-cni daemonset containers
+# -- Resource requests and limits for linkerd-cni daemonset container
 resources:
   cpu:
     # -- Maximum amount of CPU units that the cni container can use
@@ -103,3 +103,39 @@ resources:
     limit: ""
     # -- Amount of ephemeral storage that the cni container requests
     request: ""
+
+reinitializePods:
+  image:
+    # -- Docker image for the reinitialize-pods container
+    name: "cr.l5d.io/linkerd/cni-plugin"
+    # -- Tag for the reinitialize-pods container Docker image
+    version: "v1.3.0"
+    # -- Pull policy for the reinitialize-pods container
+    pullPolicy: IfNotPresent
+
+  # -- Log level for the reinitialize-pods container
+  # @default -- info
+  logLevel: info
+  # -- Log format (`plain` or `json`) for the reinitialize-pods container
+  # @default -- plain
+  logFormat: plain
+
+  # -- Include a securityContext in the reinitialize-pods container
+  enableSecurityContext: true
+
+  resources:
+    cpu:
+      # -- Maximum amount of CPU units that the reinitialize-pods container can use
+      limit: ""
+      # -- Amount of CPU units that the reinitialize-pods container requests
+      request: ""
+    memory:
+      # -- Maximum amount of memory that the reinitialize-pods container can use
+      limit: ""
+      # -- Amount of memory that the reinitialize-pods container requests
+      request: ""
+    ephemeral-storage:
+      # -- Maximum amount of ephemeral storage that the reinitialize-pods container can use
+      limit: ""
+      # -- Amount of ephemeral storage that the reinitialize-pods container requests
+      request: ""
diff --git a/cli/cmd/testdata/install-cni-plugin_default.golden b/cli/cmd/testdata/install-cni-plugin_default.golden
index 1fb551be483ed..2f52c30758d26 100644
--- a/cli/cmd/testdata/install-cni-plugin_default.golden
+++ b/cli/cmd/testdata/install-cni-plugin_default.golden
@@ -25,6 +25,9 @@ rules:
 - apiGroups: [""]
   resources: ["pods", "nodes", "namespaces", "services"]
   verbs: ["list", "get", "watch"]
+- apiGroups: [""]
+  resources: ["pods/eviction"]
+  verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
@@ -160,6 +163,52 @@ spec:
         resources:
           limits:
             cpu: "1m"
+      # This container watches over pods whose linkerd-network-validator
+      # container failed, probably because of a race condition while setting up
+      # the CNI plugin chain, and evicts those pods so they can try acquiring a
+      # proper network config again
+      - name: reinitialize-pods
+        image: cr.l5d.io/linkerd/cni-plugin:v1.3.0
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        command:
+        - /usr/lib/linkerd/linkerd-reinitialize-pods
+        args:
+        - --admin-addr=0.0.0.0:9990
+        - --log-format
+        - plain
+        - --log-level
+        - info
+        livenessProbe:
+          httpGet:
+            path: /live
+            port: admin-http
+        readinessProbe:
+          failureThreshold: 7
+          httpGet:
+            path: /ready
+            port: admin-http
+          initialDelaySeconds: 10
+        ports:
+        - containerPort: 9990
+          name: admin-http
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          privileged: false
+          readOnlyRootFilesystem: true
+          seccompProfile:
+            type: RuntimeDefault
+        resources:
+          limits:
+            cpu: "1m"
+      priorityClassName:
system-cluster-critical volumes: - name: cni-bin-dir hostPath: diff --git a/cli/cmd/testdata/install-cni-plugin_fully_configured.golden b/cli/cmd/testdata/install-cni-plugin_fully_configured.golden index b31c00286a65d..20bedd8388423 100644 --- a/cli/cmd/testdata/install-cni-plugin_fully_configured.golden +++ b/cli/cmd/testdata/install-cni-plugin_fully_configured.golden @@ -25,6 +25,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -159,6 +162,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: + failureThreshold: 7 + httpGet: + path: /ready + port: admin-http + initialDelaySeconds: 10 + ports: + - containerPort: 9990 + name: admin-http + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + resources: + priorityClassName: system-cluster-critical volumes: - name: cni-bin-dir hostPath: diff --git a/cli/cmd/testdata/install-cni-plugin_fully_configured_equal_dsts.golden b/cli/cmd/testdata/install-cni-plugin_fully_configured_equal_dsts.golden index 2b547098b2f3b..54328acb1c5e0 100644 --- a/cli/cmd/testdata/install-cni-plugin_fully_configured_equal_dsts.golden +++ b/cli/cmd/testdata/install-cni-plugin_fully_configured_equal_dsts.golden @@ -25,6 +25,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -157,6 +160,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: + failureThreshold: 7 + httpGet: + path: /ready + port: admin-http + initialDelaySeconds: 10 + ports: + - containerPort: 9990 + name: admin-http + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + seccompProfile: + type: 
RuntimeDefault + resources: + priorityClassName: system-cluster-critical volumes: - name: cni-net-dir hostPath: diff --git a/cli/cmd/testdata/install-cni-plugin_fully_configured_no_namespace.golden b/cli/cmd/testdata/install-cni-plugin_fully_configured_no_namespace.golden index b31c00286a65d..20bedd8388423 100644 --- a/cli/cmd/testdata/install-cni-plugin_fully_configured_no_namespace.golden +++ b/cli/cmd/testdata/install-cni-plugin_fully_configured_no_namespace.golden @@ -25,6 +25,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -159,6 +162,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: + failureThreshold: 7 + httpGet: + path: /ready + port: admin-http + initialDelaySeconds: 10 + ports: + - containerPort: 9990 + name: admin-http + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + resources: + priorityClassName: system-cluster-critical volumes: - name: cni-bin-dir hostPath: diff --git a/cli/cmd/testdata/install-cni-plugin_skip_ports.golden b/cli/cmd/testdata/install-cni-plugin_skip_ports.golden index 196296afc5c48..25d57f54ffe9d 100644 --- a/cli/cmd/testdata/install-cni-plugin_skip_ports.golden +++ b/cli/cmd/testdata/install-cni-plugin_skip_ports.golden @@ -25,6 +25,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -159,6 +162,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: + failureThreshold: 7 + httpGet: + path: /ready + port: admin-http + initialDelaySeconds: 10 + ports: + - containerPort: 9990 + name: admin-http + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: 
true + seccompProfile: + type: RuntimeDefault + resources: + priorityClassName: system-cluster-critical volumes: - name: cni-bin-dir hostPath: diff --git a/cli/cmd/testdata/install_cni_helm_default_output.golden b/cli/cmd/testdata/install_cni_helm_default_output.golden index 6bd305f0668af..f07cddd234eed 100644 --- a/cli/cmd/testdata/install_cni_helm_default_output.golden +++ b/cli/cmd/testdata/install_cni_helm_default_output.golden @@ -18,6 +18,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -111,7 +114,7 @@ spec: # script copies the files into place and then sleeps so # that Kubernetes doesn't keep trying to restart it. - name: install-cni - image: cr.l5d.io/linkerd/cni-plugin:v1.2.2 + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 imagePullPolicy: IfNotPresent env: - name: DEST_CNI_NET_DIR @@ -151,6 +154,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: + failureThreshold: 7 + httpGet: + path: /ready + port: admin-http + initialDelaySeconds: 10 + ports: + - containerPort: 9990 + name: admin-http + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + resources: + priorityClassName: system-cluster-critical volumes: - name: cni-bin-dir hostPath: diff --git a/cli/cmd/testdata/install_cni_helm_override_output.golden b/cli/cmd/testdata/install_cni_helm_override_output.golden index 7d99a13d1bbd5..730b02d2cf8a4 100644 --- a/cli/cmd/testdata/install_cni_helm_override_output.golden +++ b/cli/cmd/testdata/install_cni_helm_override_output.golden @@ -18,6 +18,9 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -152,6 +155,50 @@ spec: readOnlyRootFilesystem: true privileged: false resources: + # This container watches over pods whose linkerd-network-validator + # container failed, probably because of a race condition while setting up + # the CNI plugin chain, and evicts those pods so they can try acquiring a + # proper network config again + - name: reinitialize-pods + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 + imagePullPolicy: IfNotPresent + env: + - name: LINKERD_REINITIALIZE_PODS_POD_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/lib/linkerd/linkerd-reinitialize-pods + args: + - --admin-addr=0.0.0.0:9990 + - --log-format + - plain + - --log-level + - info + livenessProbe: + httpGet: + path: /live + port: admin-http + readinessProbe: 
+          failureThreshold: 7
+          httpGet:
+            path: /ready
+            port: admin-http
+          initialDelaySeconds: 10
+        ports:
+        - containerPort: 9990
+          name: admin-http
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          privileged: false
+          readOnlyRootFilesystem: true
+          seccompProfile:
+            type: RuntimeDefault
+        resources:
+      priorityClassName: system-cluster-critical
       volumes:
       - name: cni-bin-dir
         hostPath:
diff --git a/pkg/charts/cni/values.go b/pkg/charts/cni/values.go
index 6c04d776404e3..fe3a9203202fc 100644
--- a/pkg/charts/cni/values.go
+++ b/pkg/charts/cni/values.go
@@ -35,6 +35,15 @@ type Resources struct {
 	EphemeralStorage Constraints `json:"ephemeral-storage"`
 }
 
+// ReinitializePods contains the config for the reinitialize-pods container
+type ReinitializePods struct {
+	Image                 Image     `json:"image"`
+	LogLevel              string    `json:"logLevel"`
+	LogFormat             string    `json:"logFormat"`
+	EnableSecurityContext bool      `json:"enableSecurityContext"`
+	Resources             Resources `json:"resources"`
+}
+
 // Values contains the top-level elements in the cni Helm chart
 type Values struct {
 	InboundProxyPort uint `json:"inboundProxyPort"`
@@ -60,6 +69,7 @@ type Values struct {
 	EnablePSP        bool             `json:"enablePSP"`
 	Privileged       bool             `json:"privileged"`
 	Resources        Resources        `json:"resources"`
+	ReinitializePods ReinitializePods `json:"reinitializePods"`
 }
 
 // NewValues returns a new instance of the Values type.
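
For reviewers who want to exercise the new knobs, a minimal Helm values override might look like the sketch below. The keys mirror the `reinitializePods` block added to `charts/linkerd2-cni/values.yaml` in this patch; the file name, log settings, and resource figures are illustrative only, not defaults shipped by the chart.

```yaml
# reinitialize-pods-values.yaml -- hypothetical override file for the linkerd2-cni chart.
# Keys follow the reinitializePods block introduced by this patch; the concrete
# values below are examples chosen for illustration.
reinitializePods:
  image:
    name: "cr.l5d.io/linkerd/cni-plugin"
    version: "v1.3.0"
    pullPolicy: IfNotPresent
  logLevel: debug      # default: info
  logFormat: json      # default: plain
  enableSecurityContext: true
  resources:
    cpu:
      limit: "100m"    # example figure
      request: "10m"   # example figure
    memory:
      limit: "50Mi"    # example figure
      request: "20Mi"  # example figure
```

Rendering the chart with an override like this (for example `helm template charts/linkerd2-cni -f reinitialize-pods-values.yaml` from the repo root) should produce a `reinitialize-pods` container shaped like the golden files above, with the log flags and resource limits swapped in.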