diff --git a/checks/topologySpreadConstraint.yaml b/checks/topologySpreadConstraint.yaml
new file mode 100644
index 000000000..8256d42c5
--- /dev/null
+++ b/checks/topologySpreadConstraint.yaml
@@ -0,0 +1,21 @@
+successMessage: Pod has a valid topology spread constraint
+failureMessage: Pod should be configured with a valid topology spread constraint
+category: Reliability
+target: PodSpec
+schema:
+  '$schema': http://json-schema.org/draft-07/schema
+  type: object
+  required:
+    - topologySpreadConstraints
+  properties:
+    topologySpreadConstraints:
+      type: array
+      items:
+        type: object
+        properties:
+          topologyKey:
+            anyOf:
+              - type: string
+                const: "kubernetes.io/hostname"
+              - type: string
+                const: "topology.kubernetes.io/zone"
diff --git a/docs/checks/reliability.md b/docs/checks/reliability.md
index 554337deb..911ee8afd 100644
--- a/docs/checks/reliability.md
+++ b/docs/checks/reliability.md
@@ -17,19 +17,51 @@ key | default | description
 `priorityClassNotSet` | `ignore` | Fails when a priorityClassName is not set for a pod.
 `deploymentMissingReplicas` | `warning` | Fails when there is only one replica for a deployment.
 `missingPodDisruptionBudget` | `ignore`
+`topologySpreadConstraint` | `warning` | Fails when a pod has no valid topology spread constraint.
 
 ## Background
 
+### Liveness and Readiness Probes
 Readiness and liveness probes can help maintain the health of applications running inside Kubernetes. By default, Kubernetes only knows whether or not a process is running, not if it's healthy. Properly configured readiness and liveness probes will also be able to ensure the health of an application.
 
 Readiness probes are designed to ensure that an application has reached a "ready" state. In many cases there is a period of time between when a webserver process starts and when it is ready to receive traffic. A readiness probe can ensure that traffic is not sent to a pod until it is actually ready to receive it.
 
 Liveness probes are designed to ensure that an application stays in a healthy state. When a liveness probe fails, the pod will be restarted.
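+
+As a minimal sketch (the `/healthz` and `/readyz` endpoints, port, and timings below are placeholders for whatever your application actually exposes), both probes can be configured on a container like this:
+
+```
+containers:
+  - name: app
+    image: example/app:1.0.0
+    livenessProbe:
+      # Restart the container if this endpoint stops responding
+      httpGet:
+        path: /healthz
+        port: 8080
+      initialDelaySeconds: 5
+      periodSeconds: 10
+    readinessProbe:
+      # Only route traffic to the pod once this endpoint succeeds
+      httpGet:
+        path: /readyz
+        port: 8080
+      periodSeconds: 5
+```
+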
+### Image Pull Policy
 Docker's `latest` tag is applied by default to images where a tag hasn't been specified. Not specifying a specific version of an image can lead to a wide variety of problems. The underlying image could include unexpected breaking changes that break your application whenever the latest image is pulled. Reusing the same tag for multiple versions of an image can lead to different nodes in the same cluster having different versions of an image, even if the tag is identical.
 
 Related to that, relying on cached versions of a Docker image can become a security vulnerability. By default, an image will be pulled if it isn't already cached on the node attempting to run it. This can result in variations in images that are running per node, or potentially provide a way to gain access to an image without having direct access to the ImagePullSecret. With that in mind, it's often better to ensure that a pod has `imagePullPolicy: Always` specified, so images are always pulled directly from their source.
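+
+For example (a sketch; the image name and tag are placeholders), a container that pins a specific version and always pulls from the registry:
+
+```
+containers:
+  - name: app
+    # Pin an explicit version rather than relying on :latest
+    image: example/app:1.2.3
+    # Always pull from the source registry instead of trusting the node's cache
+    imagePullPolicy: Always
+```
+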
+### Topology Spread Constraints
+
+By default, the Kubernetes scheduler uses a bin-packing algorithm to fit as many pods as possible into a cluster. It prefers to keep overall node load evenly distributed rather than to spread an application's replicas precisely across nodes. Therefore, by default, the replicas of a multi-replica workload are not guaranteed to be spread across multiple availability zones.
+Kubernetes provides the `topologySpreadConstraints` configuration to better ensure that pods are spread across multiple availability zones and/or hosts.
+
+An example of a topology spread constraint that spreads pods across zones:
+
+```
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-basic-demo
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: basic-demo
+      app.kubernetes.io/instance: demo
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: basic-demo
+        app.kubernetes.io/instance: demo
+    spec:
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: "topology.kubernetes.io/zone"
+          whenUnsatisfiable: ScheduleAnyway
+```
+
+In practice, a constraint usually also needs a `labelSelector` so the scheduler knows which pods to count when calculating skew.
+
 ## Further Reading
 - [What's Wrong With The Docker :latest Tag?](https://vsupalov.com/docker-latest-tag/)
@@ -37,3 +69,4 @@ Related to that, relying on cached versions of a Docker image can become a secur
 - [Kubernetes Docs: Configure Liveness and Readiness Probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/)
 - [Utilizing Kubernetes Liveness and Readiness Probes to Automatically Recover From Failure](https://medium.com/spire-labs/utilizing-kubernetes-liveness-and-readiness-probes-to-automatically-recover-from-failure-2fe0314f2b2e)
 - [Kubernetes Liveness and Readiness Probes: How to Avoid Shooting Yourself in the Foot](https://blog.colinbreck.com/kubernetes-liveness-and-readiness-probes-how-to-avoid-shooting-yourself-in-the-foot/)
+- [Topology Spread Constraints](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/)
\ No newline at end of file
diff --git a/examples/config.yaml b/examples/config.yaml
index ec4483835..dfb352072 100644
--- a/examples/config.yaml
+++ b/examples/config.yaml
@@ -9,6 +9,7 @@ checks:
   metadataAndNameMismatched: ignore
   pdbDisruptionsIsZero: warning
   missingPodDisruptionBudget: ignore
+  topologySpreadConstraint: warning
 
   # efficiency
   cpuRequestsMissing: warning
diff --git a/pkg/config/checks.go b/pkg/config/checks.go
index f15a2e8bf..83b4fb998 100644
--- a/pkg/config/checks.go
+++ b/pkg/config/checks.go
@@ -33,6 +33,7 @@ var (
   "hostPIDSet",
   "hostNetworkSet",
   "automountServiceAccountToken",
+  "topologySpreadConstraint",
   // Container checks
   "memoryLimitsMissing",
   "memoryRequestsMissing",
diff --git a/test/checks/topologySpreadConstraint/failure.invalidtopologykey.yaml b/test/checks/topologySpreadConstraint/failure.invalidtopologykey.yaml
new file mode 100644
index 000000000..0cb09de8e
--- /dev/null
+++ b/test/checks/topologySpreadConstraint/failure.invalidtopologykey.yaml
@@ -0,0 +1,65 @@
+# Source: basic-demo/templates/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-basic-demo
+  labels:
+    app.kubernetes.io/name: basic-demo
+    helm.sh/chart: basic-demo-0.5.2
+    app.kubernetes.io/instance: demo
+    app.kubernetes.io/managed-by: Helm
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: basic-demo
+      app.kubernetes.io/instance: demo
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: basic-demo
+        app.kubernetes.io/instance: demo
+    spec:
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: farglebargle
+          whenUnsatisfiable: ScheduleAnyway
+      containers:
+        - name: basic-demo
+          image: "quay.io/fairwinds/docker-demo:latest"
+          imagePullPolicy: Always
+          env:
+            - name: REFRESH_INTERVAL
+              value: "500"
+            - name: TITLE
+              value: "Kubernetes Demo"
+            - name: METADATA
+              value: ""
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          securityContext:
+            runAsUser: 1200
+            allowPrivilegeEscalation: false
+            privileged: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true
+            capabilities:
+              drop:
+                - ALL
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+          resources:
+            limits:
+              cpu: 1
+              memory: 100Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+
diff --git a/test/checks/topologySpreadConstraint/failure.nospreadconstraint.yaml b/test/checks/topologySpreadConstraint/failure.nospreadconstraint.yaml
new file mode 100644
index 000000000..e7c96b790
--- /dev/null
+++ b/test/checks/topologySpreadConstraint/failure.nospreadconstraint.yaml
@@ -0,0 +1,61 @@
+# Source: basic-demo/templates/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-basic-demo
+  labels:
+    app.kubernetes.io/name: basic-demo
+    helm.sh/chart: basic-demo-0.5.2
+    app.kubernetes.io/instance: demo
+    app.kubernetes.io/managed-by: Helm
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: basic-demo
+      app.kubernetes.io/instance: demo
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: basic-demo
+        app.kubernetes.io/instance: demo
+    spec:
+      containers:
+        - name: basic-demo
+          image: "quay.io/fairwinds/docker-demo:latest"
+          imagePullPolicy: Always
+          env:
+            - name: REFRESH_INTERVAL
+              value: "500"
+            - name: TITLE
+              value: "Kubernetes Demo"
+            - name: METADATA
+              value: ""
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          securityContext:
+            runAsUser: 1200
+            allowPrivilegeEscalation: false
+            privileged: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true
+            capabilities:
+              drop:
+                - ALL
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+          resources:
+            limits:
+              cpu: 1
+              memory: 100Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+
diff --git a/test/checks/topologySpreadConstraint/success.yaml b/test/checks/topologySpreadConstraint/success.yaml
new file mode 100644
index 000000000..e48ee51d7
--- /dev/null
+++ b/test/checks/topologySpreadConstraint/success.yaml
@@ -0,0 +1,65 @@
+# Source: basic-demo/templates/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-basic-demo
+  labels:
+    app.kubernetes.io/name: basic-demo
+    helm.sh/chart: basic-demo-0.5.2
+    app.kubernetes.io/instance: demo
+    app.kubernetes.io/managed-by: Helm
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: basic-demo
+      app.kubernetes.io/instance: demo
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: basic-demo
+        app.kubernetes.io/instance: demo
+    spec:
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: "topology.kubernetes.io/zone"
+          whenUnsatisfiable: ScheduleAnyway
+      containers:
+        - name: basic-demo
+          image: "quay.io/fairwinds/docker-demo:latest"
+          imagePullPolicy: Always
+          env:
+            - name: REFRESH_INTERVAL
+              value: "500"
+            - name: TITLE
+              value: "Kubernetes Demo"
+            - name: METADATA
+              value: ""
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          securityContext:
+            runAsUser: 1200
+            allowPrivilegeEscalation: false
+            privileged: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true
+            capabilities:
+              drop:
+                - ALL
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+          resources:
+            limits:
+              cpu: 1
+              memory: 100Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+