From 2c2e0886d6b6b892342dced886b1dc01e79b5172 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Thu, 22 Feb 2024 14:56:41 -0300 Subject: [PATCH 01/14] remove Kubernetes Reboot Daemon (Kured) --- 05-bootstrap-prep.md | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/05-bootstrap-prep.md b/05-bootstrap-prep.md index 87b2e39c..bcc83ffd 100644 --- a/05-bootstrap-prep.md +++ b/05-bootstrap-prep.md @@ -57,36 +57,6 @@ In addition to Azure Container Registry being deployed to support bootstrapping, # Get your ACR instance name export ACR_NAME_AKS_BASELINE=$(az deployment group show -g rg-bu0001a0008 -n acr-stamp --query properties.outputs.containerRegistryName.value -o tsv) echo ACR_NAME_AKS_BASELINE: $ACR_NAME_AKS_BASELINE - - # Import core image(s) hosted in public container registries to be used during bootstrapping - az acr import --source ghcr.io/kubereboot/kured:1.15.0 -n $ACR_NAME_AKS_BASELINE - ``` - - > In this walkthrough, there is only one image that is included in the bootstrapping process. It's included as a reference for this process. Your choice to use Kubernetes Reboot Daemon (Kured) or any other images, including Helm charts, as part of your bootstrapping is yours to make. - -1. Update bootstrapping manifests to pull from your Azure Container Registry. *Optional. Fork required.* - - > Your cluster will immediately begin processing the manifests in [`cluster-manifests/`](./cluster-manifests/) due to the bootstrapping configuration that will be applied to it. So, before you deploy the cluster now would be the right time push the following changes to your fork so that it will use your files instead of the files found in the original mspnp repo which point to public container registries: - > - > - update the one `image:` value in [`kured.yaml`](./cluster-manifests/cluster-baseline-settings/kured.yaml) to use your container registry instead of a public container registry. 
See the comment in the file for instructions (or you can simply run the following command.) - - :warning: Without updating these files and using your own fork, you will be deploying your cluster such that it takes dependencies on public container registries. This is generally okay for exploratory/testing, but not suitable for production. Before going to production, ensure *all* image references you bring to your cluster are from *your* container registry (link imported in the prior step) or another that you feel confident relying on. - - ```bash - sed -i "s:ghcr.io:${ACR_NAME_AKS_BASELINE}.azurecr.io:" ./cluster-manifests/cluster-baseline-settings/kured.yaml - ``` - - Note, that if you are on macOS, you might need to use the following command instead: - - ```bash - sed -i '' 's:ghcr.io:'"${ACR_NAME_AKS_BASELINE}"'.azurecr.io:g' ./cluster-manifests/cluster-baseline-settings/kured.yaml - ``` - - Now commit changes to repository. - - ```bash - git commit -a -m "Update image source to use my ACR instance instead of a public container registry." - git push ``` ### Save your work in-progress From a54f264d4a0c5948a70fbf14c84b253c6d83a1ca Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Thu, 7 Mar 2024 14:44:47 -0300 Subject: [PATCH 02/14] add node-upgrade to docs --- 07-bootstrap-validation.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index c42cbb7c..e0cc440b 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -26,6 +26,15 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their echo AKS_CLUSTER_NAME: $AKS_CLUSTER_NAME ``` +1. Validate there is no available images upgrades. This aks cluster was just installed. Therefore only a race condition between publication of new availble images and thes deployment image fetch could result into a different state. 
+ + ```bash + az aks nodepool get-upgrades -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 && \ + az aks nodepool show -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query nodeImageVersion + ``` + + > The update phase of the AKS cluster lifecycle bleongs to day2 operations, cluster ops will be regularly updating the node images for two main reasons, the first one is for the Kubernetes cluster version and the second one is to keep up with node-level OS security updates. This can be achieved manually for the greatest degree of control by placing requests against the Azure control plane or alternatevely ops team could opt-in to allways update to the latest available version by configuring a planned maintenance window to perform this automatically. AKS provides with two configurable auto-upgrade channels dedicated to the two oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Pod Disruption Budget and Nodes Max Surge are configured in this baseline to increase the Availabilty of the workload and as another attempt to prevent from unbalance zones. + 1. Get AKS `kubectl` credentials. > In the [Microsoft Entra ID Integration](03-microsoft-entra-id.md) step, we placed our cluster under Microsoft Entra group-backed RBAC. This is the first time we are seeing this used. `az aks get-credentials` sets your `kubectl` context so that you can issue commands against your cluster. Even when you have enabled Microsoft Entra ID integration with your AKS cluster, an Azure user has sufficient permissions on the cluster resource can still access your AKS cluster by using the `--admin` switch to this command. 
Using this switch *bypasses* Microsoft Entra ID and uses client certificate authentication instead; that isn't what we want to happen. So in order to prevent that practice, local account access such as `clusterAdmin` or `clusterMonitoringUser`) is expressly disabled. From 8a2d8c72821a133356757a9ce40b31731a70f1a0 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Thu, 7 Mar 2024 14:49:01 -0300 Subject: [PATCH 03/14] start using node-image upgrade channel to update kubernetes version automatically --- cluster-stamp.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index b0657a61..3fb46ae9 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1800,7 +1800,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2023-02-02-preview' = { enabled: false // Using Microsoft Entra Workload IDs for pod identities. } autoUpgradeProfile: { - upgradeChannel: 'stable' + upgradeChannel: 'node-image' } azureMonitorProfile: { metrics: { From 37ac8c47561b4f4a650a5fa8f234b52f9dc89e71 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Thu, 7 Mar 2024 14:50:30 -0300 Subject: [PATCH 04/14] start using Security Patches for OS upgrade channel to get Node OS Level automatically --- 01-prerequisites.md | 7 +++++++ cluster-stamp.bicep | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/01-prerequisites.md b/01-prerequisites.md index c4896c10..0510f90c 100644 --- a/01-prerequisites.md +++ b/01-prerequisites.md @@ -38,8 +38,15 @@ This is the starting point for the instructions on deploying the [AKS baseline r # Keep running until all say "Registered." (This may take up to 20 minutes.) 
az feature list -o table --query "[?name=='Microsoft.ContainerService/EnableImageCleanerPreview'].{Name:name,State:properties.state}" + # Node OS Level Automatic Security Patches are currently in Preview and requies the following feature + az feature register --namespace "Microsoft.ContainerService" -n "NodeOsUpgradeChannelPreview" + + # Keep running until all say "Registered." (It takes a few minutes for the status to be udated) + az feature show --namespace "Microsoft.ContainerService" --name "NodeOsUpgradeChannelPreview" + # When all say "Registered" then re-register the AKS resource provider az provider register --namespace Microsoft.ContainerService + ``` 1. Clone/download this repo locally, or even better fork this repository. diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index 3fb46ae9..d0d67fa2 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1640,7 +1640,7 @@ resource pdzAksIngress 'Microsoft.Network/privateDnsZones@2020-06-01' = { } } -resource mc 'Microsoft.ContainerService/managedClusters@2023-02-02-preview' = { +resource mc 'Microsoft.ContainerService/managedClusters@2024-01-01' = { name: clusterName location: location tags: { @@ -1800,6 +1800,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2023-02-02-preview' = { enabled: false // Using Microsoft Entra Workload IDs for pod identities. 
} autoUpgradeProfile: { + nodeOSUpgradeChannel: 'SecurityPatch' upgradeChannel: 'node-image' } azureMonitorProfile: { From b620b43071247d86bf7ce3be7d8575c7a85802dd Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Fri, 8 Mar 2024 17:53:36 -0300 Subject: [PATCH 05/14] Address PR Feedback: improve wording and typos Co-authored-by: Jason Bouska <82831332+skabou@users.noreply.github.com> --- 01-prerequisites.md | 4 ++-- 07-bootstrap-validation.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/01-prerequisites.md b/01-prerequisites.md index 0510f90c..37215556 100644 --- a/01-prerequisites.md +++ b/01-prerequisites.md @@ -38,10 +38,10 @@ This is the starting point for the instructions on deploying the [AKS baseline r # Keep running until all say "Registered." (This may take up to 20 minutes.) az feature list -o table --query "[?name=='Microsoft.ContainerService/EnableImageCleanerPreview'].{Name:name,State:properties.state}" - # Node OS Level Automatic Security Patches are currently in Preview and requies the following feature + # Automatic node-level OS security patches are currently in preview and requires the following feature az feature register --namespace "Microsoft.ContainerService" -n "NodeOsUpgradeChannelPreview" - # Keep running until all say "Registered." (It takes a few minutes for the status to be udated) + # Keep running until all say "Registered" (It takes a few minutes for the status to be updated) az feature show --namespace "Microsoft.ContainerService" --name "NodeOsUpgradeChannelPreview" # When all say "Registered" then re-register the AKS resource provider diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index e0cc440b..37051687 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -26,7 +26,7 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their echo AKS_CLUSTER_NAME: $AKS_CLUSTER_NAME ``` -1. Validate there is no available images upgrades. 
This aks cluster was just installed. Therefore only a race condition between publication of new availble images and thes deployment image fetch could result into a different state. +1. Validate there are no available image upgrades. As this AKS cluster was recently deployed, only a race condition between publication of new available images and the deployment image fetch could result into a different state. ```bash az aks nodepool get-upgrades -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 && \ From 53178b680316a0781ede3b5ae89787a6298f84fd Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Fri, 8 Mar 2024 17:54:39 -0300 Subject: [PATCH 06/14] Address PR Feedback: use the latest api mc version Co-authored-by: Jason Bouska <82831332+skabou@users.noreply.github.com> --- cluster-stamp.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index d0d67fa2..6bb86ade 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1640,7 +1640,7 @@ resource pdzAksIngress 'Microsoft.Network/privateDnsZones@2020-06-01' = { } } -resource mc 'Microsoft.ContainerService/managedClusters@2024-01-01' = { +resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { name: clusterName location: location tags: { From 16b44d583358c606b5f2079b4cdb156c5306550b Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Mon, 11 Mar 2024 10:44:34 -0300 Subject: [PATCH 07/14] Revert "start using Security Patches for OS upgrade channel to get Node OS Level automatically" This reverts commit 2218f2e3f1ed91074192ae950229880f63b76500. this is based on a Node channel known bug: Currently, when you set the cluster auto-upgrade channel to node-image, it also automatically sets the node OS auto-upgrade channel to NodeImage. You can't change node OS auto-upgrade channel value if your cluster auto-upgrade channel is node-image. 
--- 01-prerequisites.md | 7 ------- cluster-stamp.bicep | 1 - 2 files changed, 8 deletions(-) diff --git a/01-prerequisites.md b/01-prerequisites.md index 37215556..c4896c10 100644 --- a/01-prerequisites.md +++ b/01-prerequisites.md @@ -38,15 +38,8 @@ This is the starting point for the instructions on deploying the [AKS baseline r # Keep running until all say "Registered." (This may take up to 20 minutes.) az feature list -o table --query "[?name=='Microsoft.ContainerService/EnableImageCleanerPreview'].{Name:name,State:properties.state}" - # Automatic node-level OS security patches are currently in preview and requires the following feature - az feature register --namespace "Microsoft.ContainerService" -n "NodeOsUpgradeChannelPreview" - - # Keep running until all say "Registered" (It takes a few minutes for the status to be updated) - az feature show --namespace "Microsoft.ContainerService" --name "NodeOsUpgradeChannelPreview" - # When all say "Registered" then re-register the AKS resource provider az provider register --namespace Microsoft.ContainerService - ``` 1. Clone/download this repo locally, or even better fork this repository. diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index 6bb86ade..403cf3cc 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1800,7 +1800,6 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { enabled: false // Using Microsoft Entra Workload IDs for pod identities. 
} autoUpgradeProfile: { - nodeOSUpgradeChannel: 'SecurityPatch' upgradeChannel: 'node-image' } azureMonitorProfile: { From 5677f1eda6327af2eb2f91fcb7ca625d22eb43b9 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Mon, 11 Mar 2024 11:00:12 -0300 Subject: [PATCH 08/14] Use OS NodeImage channel to receive weekly VHD updates --- 07-bootstrap-validation.md | 21 ++++++++++++++++++++- cluster-stamp.bicep | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index 37051687..ef0bade7 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -26,6 +26,25 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their echo AKS_CLUSTER_NAME: $AKS_CLUSTER_NAME ``` +1. Validate the current day2 strategy this baseline follows to upagrade the AKS cluster + + ```bash + az aks show -n $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query "autoUpgradeProfile" + ``` + + ```outcome + { + "nodeOsUpgradeChannel": "NodeImage", + "upgradeChannel": "node-image" + } + ``` + + > This cluster is now receiving OS and Kubernetes updates on weekly bassis. For some workloads where it is imperative to be running always on top of the most secure OS version available, it is possible to opt-in for regular updates by picking up the `SecurityPatch` channel instead. + + > The node update phase of the cluster lifecycle belongs to day2 operations. Cluster ops will update their node images as regular as required for two main reasons, the first one is for the Kubernetes cluster version, and the second one is to keep up with node-level OS updates. A new AKS release will be introducing new features such as new addons as well as making new kubernetes versions available while new AKS Node Images introduce changes at the OS level. Both release types follow Azure Safe Deployments Practices to roll out to all regions. 
For more information please take a look [How to use the release tracker](https://learn.microsoft.com/azure/aks/release-tracker#how-to-use-the-release-tracker). Additionally, cluster ops want to keep up with supported kubernetes versions for SLA reasons as well as to prevent from piled up updates since version updates can't be skipped at one's discretion. For more information, please take a look at [Kubernetes version upgrades](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#kubernetes-version-upgrades). + + > Once a new update is available, it can be applied manually for the greatest degree of control by placing requests against the Azure control plane. Alternatevely, ops team could opt-in to automatically update to the latest available version by configuring an udpate channel following a desired cadence combining this with a planned maintenance window, one for kubernetes version updates and another one for OS level upgrades. AKS provides with two configurable different auto-upgrade channels dedicated to the oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Nodes Max Surge and Pod Disruption Budget are configured in this baseline to prevent from unbalanced zones increasing the Availabilty. By default clusters nodes are updated one at the time. Max Surge has the ability to increase or reduce the speed of a cluster upgrade. In clusters with 4+ nodes hosting worloads that are sensitive to disruptions, it is recommended up to `33%` surge to achieve a safe upgrade pace. 
For more information, please take a look at [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). To prevent from disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value) by taking into account the specific charactistics of their workloads. + 1. Validate there are no available image upgrades. As this AKS cluster was recently deployed, only a race condition between publication of new available images and the deployment image fetch could result into a different state. ```bash @@ -33,7 +52,7 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their az aks nodepool show -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query nodeImageVersion ``` - > The update phase of the AKS cluster lifecycle bleongs to day2 operations, cluster ops will be regularly updating the node images for two main reasons, the first one is for the Kubernetes cluster version and the second one is to keep up with node-level OS security updates. This can be achieved manually for the greatest degree of control by placing requests against the Azure control plane or alternatevely ops team could opt-in to allways update to the latest available version by configuring a planned maintenance window to perform this automatically. AKS provides with two configurable auto-upgrade channels dedicated to the two oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). 
Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Pod Disruption Budget and Nodes Max Surge are configured in this baseline to increase the Availabilty of the workload and as another attempt to prevent from unbalance zones. + > Typically, base node iamges doesn't contain a suffix with a date (i.e. `AKSUbuntu-2204gen2containerd`). If the `nodeImageVersion` value looks like `AKSUbuntu-2204gen2containerd-202402.26.0` a SecurityPatch or NodeImage upgrade has been applied to the aks node. 1. Get AKS `kubectl` credentials. diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index 403cf3cc..93a90e3c 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1800,6 +1800,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { enabled: false // Using Microsoft Entra Workload IDs for pod identities. } autoUpgradeProfile: { + nodeOSUpgradeChannel: 'NodeImage' upgradeChannel: 'node-image' } azureMonitorProfile: { From a455f324f1017b65552a15272b95c76a114e72e5 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Mon, 11 Mar 2024 15:50:44 -0300 Subject: [PATCH 09/14] Address PR Feedback: add maintenance configuration window --- 07-bootstrap-validation.md | 8 ++++++++ cluster-stamp.bicep | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index ef0bade7..807880a6 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -45,6 +45,14 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their > Once a new update is available, it can be applied manually for the greatest degree of control by placing requests against the Azure control plane. 
Alternatevely, ops team could opt-in to automatically update to the latest available version by configuring an udpate channel following a desired cadence combining this with a planned maintenance window, one for kubernetes version updates and another one for OS level upgrades. AKS provides with two configurable different auto-upgrade channels dedicated to the oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Nodes Max Surge and Pod Disruption Budget are configured in this baseline to prevent from unbalanced zones increasing the Availabilty. By default clusters nodes are updated one at the time. Max Surge has the ability to increase or reduce the speed of a cluster upgrade. In clusters with 4+ nodes hosting worloads that are sensitive to disruptions, it is recommended up to `33%` surge to achieve a safe upgrade pace. For more information, please take a look at [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). To prevent from disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value) by taking into account the specific charactistics of their workloads. +1. See your maitenance configuration + + ```bash + az aks maintenanceconfiguration list --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 + ``` + + > It is recommended to be prescribed about when upgrades should occur. 
In case the maitenance windows overlap, AKS decides the running order. Leave at least 24h between maintance window configurations while it will depends based the number of nodes of a particular cluster and the time to upgrade being configured. The OS level updates maintenance window is scheduled to weekly cadence since the OS channel is configured with `NodeImage` where a new node image gets shipped every week. Consider changing this to daily if you opt in for the `SecurityPatch` channel. To stay on top of the kubernetes N-2 version, a monthly cadence is just fine while this cluster is configured every two weeks to keep it updated more regularly. Performing maintenance operations are considered best-effort only and aren't guaranteed to occur within a specified window. While not strictly recommended, for greater level of control consider updating a cluster manually. + 1. Validate there are no available image upgrades. As this AKS cluster was recently deployed, only a race condition between publication of new available images and the deployment image fetch could result into a different state. 
```bash diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index 93a90e3c..b299df15 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1908,6 +1908,39 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { kvPodMiIngressControllerKeyVaultReader_roleAssignment kvPodMiIngressControllerSecretsUserRole_roleAssignment ] + + resource os_maintenanceConfigurations 'maintenanceConfigurations' = { + name: 'aksManagedNodeOSUpgradeSchedule' + properties: { + maintenanceWindow: { + durationHours: 12 + schedule: { + weekly: { + dayOfWeek: 'Tuesday' + intervalWeeks: 1 + } + } + startTime: '09:00' + } + } + } + + resource k8s_maintenanceConfigurations 'maintenanceConfigurations' = { + name: 'aksManagedAutoUpgradeSchedule' + properties: { + maintenanceWindow: { + durationHours: 12 + schedule: { + weekly: { + dayOfWeek: 'Wednesday' + intervalWeeks: 2 + } + } + startTime: '21:00' + } + } + } + } resource acrKubeletAcrPullRole_roleAssignment 'Microsoft.Authorization/roleAssignments@2020-10-01-preview' = { From 99f98ce38189124220276247dbad4919a923a668 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Mon, 11 Mar 2024 16:06:34 -0300 Subject: [PATCH 10/14] bug fix max surge guidance --- 07-bootstrap-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index 807880a6..0471dd35 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -43,7 +43,7 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their > The node update phase of the cluster lifecycle belongs to day2 operations. Cluster ops will update their node images as regular as required for two main reasons, the first one is for the Kubernetes cluster version, and the second one is to keep up with node-level OS updates. 
A new AKS release will be introducing new features such as new addons as well as making new kubernetes versions available while new AKS Node Images introduce changes at the OS level. Both release types follow Azure Safe Deployments Practices to roll out to all regions. For more information please take a look [How to use the release tracker](https://learn.microsoft.com/azure/aks/release-tracker#how-to-use-the-release-tracker). Additionally, cluster ops want to keep up with supported kubernetes versions for SLA reasons as well as to prevent from piled up updates since version updates can't be skipped at one's discretion. For more information, please take a look at [Kubernetes version upgrades](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#kubernetes-version-upgrades). - > Once a new update is available, it can be applied manually for the greatest degree of control by placing requests against the Azure control plane. Alternatevely, ops team could opt-in to automatically update to the latest available version by configuring an udpate channel following a desired cadence combining this with a planned maintenance window, one for kubernetes version updates and another one for OS level upgrades. AKS provides with two configurable different auto-upgrade channels dedicated to the oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Nodes Max Surge and Pod Disruption Budget are configured in this baseline to prevent from unbalanced zones increasing the Availabilty. By default clusters nodes are updated one at the time. Max Surge has the ability to increase or reduce the speed of a cluster upgrade. 
In clusters with 4+ nodes hosting worloads that are sensitive to disruptions, it is recommended up to `33%` surge to achieve a safe upgrade pace. For more information, please take a look at [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). To prevent from disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value) by taking into account the specific charactistics of their workloads. + > Once a new update is available, it can be applied manually for the greatest degree of control by placing requests against the Azure control plane. Alternatevely, ops team could opt-in to automatically update to the latest available version by configuring an udpate channel following a desired cadence combining this with a planned maintenance window, one for kubernetes version updates and another one for OS level upgrades. AKS provides with two configurable different auto-upgrade channels dedicated to the oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Nodes Max Surge and Pod Disruption Budget are configured in this baseline to prevent from unbalanced zones increasing the Availabilty. By default clusters nodes are updated one at the time. Max Surge has the ability to increase or reduce the speed of a cluster upgrade. 
In clusters with 6+ nodes hosting worloads that are sensitive to disruptions, it is recommended up to `33%` surge to achieve a safe upgrade pace. For more information, please take a look at [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). To prevent from disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value) by taking into account the specific charactistics of their workloads. 1. See your maitenance configuration From df0a02ad361428456c6652e327dea7603b511b54 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Mon, 11 Mar 2024 18:15:31 -0300 Subject: [PATCH 11/14] Address PR Feedback: improve wording around guidance Co-authored-by: Jason Bouska <82831332+skabou@users.noreply.github.com> --- 07-bootstrap-validation.md | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index 0471dd35..e63a5f51 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -39,11 +39,11 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their } ``` - > This cluster is now receiving OS and Kubernetes updates on weekly bassis. For some workloads where it is imperative to be running always on top of the most secure OS version available, it is possible to opt-in for regular updates by picking up the `SecurityPatch` channel instead. + > This cluster now receives weekly updates for both the Operating System (OS) and Kubernetes. For workloads that need to always run the most secure OS version, you can opt-in for regular updates by selecting the `SecurityPatch` channel. 
- > The node update phase of the cluster lifecycle belongs to day2 operations. Cluster ops will update their node images as regular as required for two main reasons, the first one is for the Kubernetes cluster version, and the second one is to keep up with node-level OS updates. A new AKS release will be introducing new features such as new addons as well as making new kubernetes versions available while new AKS Node Images introduce changes at the OS level. Both release types follow Azure Safe Deployments Practices to roll out to all regions. For more information please take a look [How to use the release tracker](https://learn.microsoft.com/azure/aks/release-tracker#how-to-use-the-release-tracker). Additionally, cluster ops want to keep up with supported kubernetes versions for SLA reasons as well as to prevent from piled up updates since version updates can't be skipped at one's discretion. For more information, please take a look at [Kubernetes version upgrades](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#kubernetes-version-upgrades). + > The node update phase of the cluster’s lifecycle belongs to day2 operations. Cluster operations will regularly update node images for two main reasons: 1) to update the Kubernetes cluster version, and 2) to keep up with node-level OS updates. A new AKS release introduces new features, such as addons and new Kubernetes versions, while new AKS node images bring changes at the OS level. Both types of releases adhere to Azure Safe Deployment Practices for rollout across all regions. For more information, please refer to [How to use the release tracker](https://learn.microsoft.com/azure/aks/release-tracker#how-to-use-the-release-tracker). Additionally, cluster operations aim to stay updated with supported Kubernetes versions for Service Level Agreement (SLA) compliance and to avoid accumulating updates, as version updates cannot be skipped at will. 
For more details, please see [Kubernetes version upgrades](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#kubernetes-version-upgrades). - > Once a new update is available, it can be applied manually for the greatest degree of control by placing requests against the Azure control plane. Alternatevely, ops team could opt-in to automatically update to the latest available version by configuring an udpate channel following a desired cadence combining this with a planned maintenance window, one for kubernetes version updates and another one for OS level upgrades. AKS provides with two configurable different auto-upgrade channels dedicated to the oforementioned update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Nodepools in this AKS cluster span into multiple availability zones, so an important consideration is that automatic updates are conducted based on a best-effort zone balancing in node groups. Nodes Max Surge and Pod Disruption Budget are configured in this baseline to prevent from unbalanced zones increasing the Availabilty. By default clusters nodes are updated one at the time. Max Surge has the ability to increase or reduce the speed of a cluster upgrade. In clusters with 6+ nodes hosting worloads that are sensitive to disruptions, it is recommended up to `33%` surge to achieve a safe upgrade pace. For more information, please take a look at [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). 
To prevent from disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value) by taking into account the specific charactistics of their workloads. + > When a new update becomes available, it can be manually applied for the greatest degree of control by making requests against the Azure control plane. Alternatively, the operations team can opt to automatically update to the latest version by configuring an update channel to follow the desired cadence. This can be combined with a planned maintenance window, one for Kubernetes version updates and another for OS-level upgrades. AKS offers two different configurable auto-upgrade channels dedicated to these update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Node pools in this AKS cluster span multiple availability zones. Therefore, it’s important to note that automatic updates are conducted based on a best-effort zone balancing in node groups. To prevent zone imbalance and increase availability, Nodes Max Surge and Pod Disruption Budget are configured in this baseline. By default, cluster nodes are updated one at a time. Max Surge can adjust the speed of a cluster upgrade. In clusters with 6+ nodes hosting disruption-sensitive workloads, a surge of up to `33%` is recommended for a safe upgrade pace. For more information, please see [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). 
To minimize disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value), taking into account the specific characteristics of their workloads. 1. See your maitenance configuration @@ -51,7 +51,25 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their az aks maintenanceconfiguration list --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 ``` - > It is recommended to be prescribed about when upgrades should occur. In case the maitenance windows overlap, AKS decides the running order. Leave at least 24h between maintance window configurations while it will depends based the number of nodes of a particular cluster and the time to upgrade being configured. The OS level updates maintenance window is scheduled to weekly cadence since the OS channel is configured with `NodeImage` where a new node image gets shipped every week. Consider changing this to daily if you opt in for the `SecurityPatch` channel. To stay on top of the kubernetes N-2 version, a monthly cadence is just fine while this cluster is configured every two weeks to keep it updated more regularly. Performing maintenance operations are considered best-effort only and aren't guaranteed to occur within a specified window. While not strictly recommended, for greater level of control consider updating a cluster manually. +> When managing an Azure Kubernetes Service (AKS) cluster, it is crucial to plan your upgrades thoughtfully. Here are some recommendations to consider: +> +> Mindful Timing for Upgrades: +> - Be mindful of when upgrades should occur. If you have overlapping maintenance windows, AKS will determine the running order. +> - To avoid conflicts, leave at least 24 hours between maintenance window configurations. 
The timing will depend on the number of nodes in your specific cluster and the duration required for upgrades. +> +> OS-Level Updates: +> - By default, the OS-level updates maintenance window is scheduled on a weekly cadence. This is because the OS channel is configured with `NodeImage`, where a new node image is shipped every week. +> - If you choose the `SecurityPatch` channel, consider changing the maintenance window to daily for more frequent updates. +> +> Kubernetes Version Management: +> - To stay current with the latest Kubernetes version, a monthly cadence is generally sufficient. However, you can adjust this based on your specific needs. +> - For more regular updates, configure your cluster to upgrade every two weeks. +> +> Maintenance Operations: +> - Keep in mind that performing maintenance operations is considered best-effort. They are not guaranteed to occur within a specific window. +> - While it’s not strictly recommended, if you require greater control, consider manually updating your cluster. +> +> Remember that these guidelines provide flexibility, allowing you to strike a balance between timely updates and operational control. Choose the approach that aligns best with your organization’s requirements. 1. Validate there are no available image upgrades. As this AKS cluster was recently deployed, only a race condition between publication of new available images and the deployment image fetch could result into a different state. 
From e99adff4dc42327f7f981ea96a15d96c440407a2 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Wed, 20 Mar 2024 13:17:36 -0300 Subject: [PATCH 12/14] feat (cluster): [day2-ops] image update configuration node-level only (#405) --- 07-bootstrap-validation.md | 51 ++++---------------------------------- cluster-stamp.bicep | 19 +------------- 2 files changed, 6 insertions(+), 64 deletions(-) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index e63a5f51..a0ff8273 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -26,51 +26,6 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their echo AKS_CLUSTER_NAME: $AKS_CLUSTER_NAME ``` -1. Validate the current day2 strategy this baseline follows to upagrade the AKS cluster - - ```bash - az aks show -n $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query "autoUpgradeProfile" - ``` - - ```outcome - { - "nodeOsUpgradeChannel": "NodeImage", - "upgradeChannel": "node-image" - } - ``` - - > This cluster now receives weekly updates for both the Operating System (OS) and Kubernetes. For workloads that need to always run the most secure OS version, you can opt-in for regular updates by selecting the `SecurityPatch` channel. - - > The node update phase of the cluster’s lifecycle belongs to day2 operations. Cluster operations will regularly update node images for two main reasons: 1) to update the Kubernetes cluster version, and 2) to keep up with node-level OS updates. A new AKS release introduces new features, such as addons and new Kubernetes versions, while new AKS node images bring changes at the OS level. Both types of releases adhere to Azure Safe Deployment Practices for rollout across all regions. For more information, please refer to [How to use the release tracker](https://learn.microsoft.com/azure/aks/release-tracker#how-to-use-the-release-tracker). 
Additionally, cluster operations aim to stay updated with supported Kubernetes versions for Service Level Agreement (SLA) compliance and to avoid accumulating updates, as version updates cannot be skipped at will. For more details, please see [Kubernetes version upgrades](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#kubernetes-version-upgrades). - - > When a new update becomes available, it can be manually applied for the greatest degree of control by making requests against the Azure control plane. Alternatively, the operations team can opt to automatically update to the latest version by configuring an update channel to follow the desired cadence. This can be combined with a planned maintenance window, one for Kubernetes version updates and another for OS-level upgrades. AKS offers two different configurable auto-upgrade channels dedicated to these update types. For more information, please refer to [Upgrade options for Azure Kubernetes Service (AKS) clusters](https://learn.microsoft.com/azure/aks/upgrade-cluster). Node pools in this AKS cluster span multiple availability zones. Therefore, it’s important to note that automatic updates are conducted based on a best-effort zone balancing in node groups. To prevent zone imbalance and increase availability, Nodes Max Surge and Pod Disruption Budget are configured in this baseline. By default, cluster nodes are updated one at a time. Max Surge can adjust the speed of a cluster upgrade. In clusters with 6+ nodes hosting disruption-sensitive workloads, a surge of up to `33%` is recommended for a safe upgrade pace. For more information, please see [Customer node surge upgrade](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#customize-node-surge-upgrade). 
To minimize disruption, production clusters should be configured with [node draining timeout](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-drain-timeout-valuei) and [soak time](https://learn.microsoft.com/azure/aks/upgrade-aks-cluster?tabs=azure-cli#set-node-soak-time-value), taking into account the specific characteristics of their workloads. - -1. See your maitenance configuration - - ```bash - az aks maintenanceconfiguration list --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 - ``` - -> When managing an Azure Kubernetes Service (AKS) cluster, it is crucial to plan your upgrades thoughtfully. Here are some recommendations to consider: -> -> Mindful Timing for Upgrades: -> - Be mindful of when upgrades should occur. If you have overlapping maintenance windows, AKS will determine the running order. -> - To avoid conflicts, leave at least 24 hours between maintenance window configurations. The timing will depend on the number of nodes in your specific cluster and the duration required for upgrades. -> -> OS-Level Updates: -> - By default, the OS-level updates maintenance window is scheduled on a weekly cadence. This is because the OS channel is configured with `NodeImage`, where a new node image is shipped every week. -> - If you choose the `SecurityPatch` channel, consider changing the maintenance window to daily for more frequent updates. -> -> Kubernetes Version Management: -> - To stay current with the latest Kubernetes version, a monthly cadence is generally sufficient. However, you can adjust this based on your specific needs. -> - For more regular updates, configure your cluster to upgrade every two weeks. -> -> Maintenance Operations: -> - Keep in mind that performing maintenance operations is considered best-effort. They are not guaranteed to occur within a specific window. -> - While it’s not strictly recommended, if you require greater control, consider manually updating your cluster. 
-> -> Remember that these guidelines provide flexibility, allowing you to strike a balance between timely updates and operational control. Choose the approach that aligns best with your organization’s requirements. - 1. Validate there are no available image upgrades. As this AKS cluster was recently deployed, only a race condition between publication of new available images and the deployment image fetch could result into a different state. ```bash @@ -78,7 +33,11 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their az aks nodepool show -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query nodeImageVersion ``` - > Typically, base node iamges doesn't contain a suffix with a date (i.e. `AKSUbuntu-2204gen2containerd`). If the `nodeImageVersion` value looks like `AKSUbuntu-2204gen2containerd-202402.26.0` a SecurityPatch or NodeImage upgrade has been applied to the aks node. + > Typically, base node iamges doesn't contain a suffix with a date (i.e. `AKSUbuntu-2204gen2containerd`). If the `nodeImageVersion` value looks like `AKSUbuntu-2204gen2containerd-202402.26.0` a SecurityPatch or NodeImage upgrade has been applied to the AKS node. + + > The AKS nodes are configured to automatically receives weekly image updates including security patches, kernel and other node related stuff. AKS cluster version won't be automatically updated since production cluster should be manually updated after testing in lower environments. + + > Node image updates are shipped on a weekly default cadence. The maintenance window of this AKS cluster for node image updates is configured every Tuesday at 9PM. If that node image is released out of this maintenance window, the nodes will catchup on the following ocurrence. AKS nodes that require to be more frequently updated could consider changing its auto-upgrade channel to `SecurityPatch` and configure a daily maintenance window. 1. Get AKS `kubectl` credentials. 
diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index b299df15..4de6304a 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -1801,7 +1801,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { } autoUpgradeProfile: { nodeOSUpgradeChannel: 'NodeImage' - upgradeChannel: 'node-image' + upgradeChannel: 'none' } azureMonitorProfile: { metrics: { @@ -1920,27 +1920,10 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-01-02-preview' = { intervalWeeks: 1 } } - startTime: '09:00' - } - } - } - - resource k8s_maintenanceConfigurations 'maintenanceConfigurations' = { - name: 'aksManagedAutoUpgradeSchedule' - properties: { - maintenanceWindow: { - durationHours: 12 - schedule: { - weekly: { - dayOfWeek: 'Wednesday' - intervalWeeks: 2 - } - } startTime: '21:00' } } } - } resource acrKubeletAcrPullRole_roleAssignment 'Microsoft.Authorization/roleAssignments@2020-10-01-preview' = { From 864599661542c677027d08b76f890b222b66bd0a Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Wed, 20 Mar 2024 16:27:46 -0300 Subject: [PATCH 13/14] Address PR Feedback: improve wording and typos Co-authored-by: Jason Bouska <82831332+skabou@users.noreply.github.com> --- 07-bootstrap-validation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index a0ff8273..d43cf28d 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -33,11 +33,11 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their az aks nodepool show -n npuser01 --cluster-name $AKS_CLUSTER_NAME -g rg-bu0001a0008 --query nodeImageVersion ``` - > Typically, base node iamges doesn't contain a suffix with a date (i.e. `AKSUbuntu-2204gen2containerd`). If the `nodeImageVersion` value looks like `AKSUbuntu-2204gen2containerd-202402.26.0` a SecurityPatch or NodeImage upgrade has been applied to the AKS node. 
+ > Typically, base node images don't contain a suffix with a date (e.g. `AKSUbuntu-2204gen2containerd`). If the `nodeImageVersion` value looks like `AKSUbuntu-2204gen2containerd-202402.26.0`, a SecurityPatch or NodeImage upgrade has been applied to the AKS node. - > The AKS nodes are configured to automatically receives weekly image updates including security patches, kernel and other node related stuff. AKS cluster version won't be automatically updated since production cluster should be manually updated after testing in lower environments. + > The AKS nodes are configured to receive weekly updates automatically which include security patches, kernel updates, and node image updates. The AKS cluster version won't be updated automatically since production clusters should be updated manually after testing in lower environments. - > Node image updates are shipped on a weekly default cadence. The maintenance window of this AKS cluster for node image updates is configured every Tuesday at 9PM. If that node image is released out of this maintenance window, the nodes will catchup on the following ocurrence. AKS nodes that require to be more frequently updated could consider changing its auto-upgrade channel to `SecurityPatch` and configure a daily maintenance window. + > Node image updates are shipped on a weekly cadence by default. This AKS cluster is configured to have its maintenance window for node image updates every Tuesday at 9PM. If a node image is released outside of this maintenance window, the nodes will be updated on the next scheduled occurrence. For AKS nodes that require more frequent updates, consider changing the auto-upgrade channel to `SecurityPatch` and configuring a daily maintenance window. 1. Get AKS `kubectl` credentials.
From fd303d261012fb84ca218fb82560062b0fea3370 Mon Sep 17 00:00:00 2001 From: Fernando Antivero Date: Wed, 20 Mar 2024 16:58:48 -0300 Subject: [PATCH 14/14] remove pendent kured resources/content --- 07-bootstrap-validation.md | 2 - cluster-manifests/README.md | 5 - .../cluster-baseline-settings/kured.yaml | 183 ------------------ cluster-stamp.bicep | 33 +--- 4 files changed, 3 insertions(+), 220 deletions(-) delete mode 100644 cluster-manifests/cluster-baseline-settings/kured.yaml diff --git a/07-bootstrap-validation.md b/07-bootstrap-validation.md index d43cf28d..9b54340d 100644 --- a/07-bootstrap-validation.md +++ b/07-bootstrap-validation.md @@ -65,11 +65,9 @@ GitOps allows a team to author Kubernetes manifest files, persist them in their The bootstrapping process that already happened due to the usage of the Flux extension for AKS has set up the following, amoung other things - the workload's namespace named `a0008` - - installed kured ```bash kubectl get namespaces - kubectl get all -n cluster-baseline-settings ``` These commands will show you results that were due to the automatic bootstrapping process your cluster experienced due to the Flux GitOps extension. This content mirrors the content found in [`cluster-manifests`](./cluster-manifests), and commits made there will reflect in your cluster within minutes of making the change. diff --git a/cluster-manifests/README.md b/cluster-manifests/README.md index bd5675ea..477e9b6f 100644 --- a/cluster-manifests/README.md +++ b/cluster-manifests/README.md @@ -8,14 +8,9 @@ This is the root of the GitOps configuration directory. These Kubernetes object - Default Namespaces - Kubernetes RBAC Role Assignments (cluster and namespace) to Microsoft Entra groups. *Optional* -- [Kured](#kured) - Ingress Network Policy - Azure Monitor Prometheus Scraping -### Kured - -Kured is included as a solution to handle occasional required reboots from daily OS patching. 
This open-source software component is only needed if you require a managed rebooting solution between weekly [node image upgrades](https://learn.microsoft.com/azure/aks/node-image-upgrade). Building a process around deploying node image upgrades [every week](https://github.com/Azure/AKS/releases) satisfies most organizational weekly patching cadence requirements. Combined with most security patches on Linux not requiring reboots often, this leaves your cluster in a well supported state. If weekly node image upgrades satisfies your business requirements, then remove Kured from this solution by deleting [`kured.yaml`](./cluster-baseline-settings/kured.yaml). If however weekly patching using node image upgrades is not sufficient and you need to respond to daily security updates that mandate a reboot ASAP, then using a solution like Kured will help you achieve that objective. **Kured is not supported by Microsoft Support.** - ## Private bootstrapping repository Typically, your bootstrapping repository wouldn't be a public-facing repository like this one, but instead a private GitHub or Azure DevOps repo. The Flux operator deployed with the cluster supports private Git repositories as your bootstrapping source. In addition to requiring network line of sight to the repository from your cluster's nodes, you'll also need to ensure that you've provided the necessary credentials. This can come, typically, in the form of certificate-based SSH or personal access tokens (PAT), both ideally scoped as read-only to the repo with no additional permissions. 
diff --git a/cluster-manifests/cluster-baseline-settings/kured.yaml b/cluster-manifests/cluster-baseline-settings/kured.yaml deleted file mode 100644 index bb2f82a4..00000000 --- a/cluster-manifests/cluster-baseline-settings/kured.yaml +++ /dev/null @@ -1,183 +0,0 @@ -# Source: https://github.com/kubereboot/charts/tree/kured-5.2.0/charts/kured (1.15.0) -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kured -rules: -# Allow kured to read spec.unschedulable -# Allow kubectl to drain/uncordon -# -# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below -# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go -# -- apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "patch"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["list","delete","get"] -- apiGroups: ["extensions"] - resources: ["daemonsets"] - verbs: ["get"] -- apiGroups: ["apps"] - resources: ["daemonsets"] - verbs: ["get"] -- apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kured -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kured -subjects: -- kind: ServiceAccount - name: kured - namespace: cluster-baseline-settings ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: cluster-baseline-settings - name: kured -rules: - # Allow kured to lock/unlock itself - - apiGroups: ["extensions"] - resources: ["daemonsets"] - resourceNames: ["kured"] - verbs: ["update", "patch"] - - apiGroups: ["apps"] - resources: ["daemonsets"] - resourceNames: ["kured"] - verbs: ["update", "patch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - namespace: cluster-baseline-settings - name: kured -subjects: -- kind: ServiceAccount - namespace: cluster-baseline-settings - name: kured -roleRef: - apiGroup: 
rbac.authorization.k8s.io - kind: Role - name: kured ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kured - namespace: cluster-baseline-settings ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: kured # Must match `--ds-name` - namespace: cluster-baseline-settings # Must match `--ds-namespace` -spec: - revisionHistoryLimit: 10 - selector: - matchLabels: - app.kubernetes.io/name: kured - updateStrategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 1 - template: - metadata: - labels: - app.kubernetes.io/name: kured - annotations: - prometheus.io/scrape: "true" - prometheus.io/path: "/metrics" - prometheus.io/port: "8080" - spec: - serviceAccountName: kured - tolerations: - - key: node-role.kubernetes.io/control-plane - effect: NoSchedule - - key: node-role.kubernetes.io/master - effect: NoSchedule - - key: CriticalAddonsOnly - effect: NoSchedule - operator: Equal - value: "true" - hostNetwork: true - hostPID: true # Facilitate entering the host mount namespace via init - restartPolicy: Always - nodeSelector: - kubernetes.io/arch: amd64 - kubernetes.io/os: linux - containers: - - name: kured - # PRODUCTION READINESS CHANGE REQUIRED - # This image should be sourced from a non-public container registry, such as the - # one deployed along side of this reference implementation. 
- # az acr import --source ghcr.io/kubereboot/kured:1.15.0 -n - # and then set this to - # image: .azurecr.io/kubereboot/kured:1.15.0 - image: ghcr.io/kubereboot/kured:1.15.0 - imagePullPolicy: IfNotPresent - securityContext: - privileged: true # Give permission to nsenter /proc/1/ns/mnt - resources: - limits: - cpu: 500m - memory: 48Mi - requests: - cpu: 200m - memory: 16Mi - ports: - - containerPort: 8080 - name: metrics - env: - # Pass in the name of the node on which this pod is scheduled - # for use with drain/uncordon operations and lock acquisition - - name: KURED_NODE_ID - valueFrom: - fieldRef: - fieldPath: spec.nodeName - command: - - /usr/bin/kured - args: - - --ds-namespace=cluster-baseline-settings -# - --ds-name=kured -# - --reboot-command=/bin/systemctl reboot -# - --force-reboot=false -# - --drain-grace-period=-1 -# - --skip-wait-for-delete-timeout=0 -# - --drain-timeout=0 -# - --period=1h -# - --ds-name=kured -# - --lock-annotation=weave.works/kured-node-lock -# - --lock-ttl=0 -# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local -# - --alert-filter-regexp=^RebootRequired$ -# - --alert-firing-only=false -# - --reboot-sentinel=/var/run/reboot-required -# - --prefer-no-schedule-taint="" -# - --reboot-sentinel-command="" -# - --slack-hook-url=https://hooks.slack.com/... -# - --slack-username=prod -# - --slack-channel=alerting -# - --notify-url="" # See also shoutrrr url format -# - --message-template-drain=Draining node %s -# - --message-template-reboot=Rebooting node %s -# - --message-template-uncordon=Node %s rebooted & uncordoned successfully! -# - --blocking-pod-selector=runtime=long,cost=expensive -# - --blocking-pod-selector=name=temperamental -# - --blocking-pod-selector=... 
-# - --reboot-days=sun,mon,tue,wed,thu,fri,sat -# - --reboot-delay=90s -# - --start-time=0:00 -# - --end-time=23:59:59 -# - --time-zone=UTC -# - --annotate-nodes=false -# - --lock-release-delay=30m -# - --log-format=text \ No newline at end of file diff --git a/cluster-stamp.bicep b/cluster-stamp.bicep index 4de6304a..46efbe9c 100644 --- a/cluster-stamp.bicep +++ b/cluster-stamp.bicep @@ -263,23 +263,6 @@ resource qPrometheusAll 'Microsoft.OperationalInsights/queryPacks/queries@2019-0 } } -// Example query that shows the usage of a specific Prometheus metric emitted by Kured -resource qNodeReboots 'Microsoft.OperationalInsights/queryPacks/queries@2019-09-01' = { - parent: qpBaselineQueryPack - name: guid(resourceGroup().id, 'KuredNodeReboot', clusterName) - properties: { - displayName: 'Kubenertes node reboot requested' - description: 'Which Kubernetes nodes are flagged for reboot (based on Prometheus metrics).' - body: 'InsightsMetrics | where Namespace == "prometheus" and Name == "kured_reboot_required" | where Val > 0' - related: { - categories: [ - 'container' - 'management' - ] - } - } -} - resource sci 'Microsoft.OperationsManagement/solutions@2015-11-01-preview' = { name: 'ContainerInsights(${la.name})' location: location @@ -961,15 +944,6 @@ resource paAKSLinuxRestrictive 'Microsoft.Authorization/policyAssignments@2021-0 'azure-arc' 'flux-system' - // Known violations - // K8sAzureAllowedSeccomp - // - Kured, no profile defined - // K8sAzureContainerNoPrivilege - // - Kured, requires privileged to perform reboot - // K8sAzureBlockHostNamespaceV2 - // - Kured, shared host namespace - // K8sAzureAllowedUsersGroups - // - Kured, no runAsNonRoot, no runAsGroup, no supplementalGroups, no fsGroup 'cluster-baseline-settings' // Known violations @@ -1054,7 +1028,6 @@ resource paRoRootFilesystem 'Microsoft.Authorization/policyAssignments@2021-06-0 } excludedContainers: { value: [ - 'kured' // Kured 'aspnet-webapp-sample' // ASP.NET Core does not support read-only 
root ] } @@ -1078,10 +1051,10 @@ resource paEnforceResourceLimits 'Microsoft.Authorization/policyAssignments@2021 policyDefinitionId: pdEnforceResourceLimitsId parameters: { cpuLimit: { - value: '500m' // Kured = 500m, traefik-ingress-controller = 200m, aspnet-webapp-sample = 100m + value: '500m' // traefik-ingress-controller = 200m, aspnet-webapp-sample = 100m } memoryLimit: { - value: '256Mi' // aspnet-webapp-sample = 256Mi, traefik-ingress-controller = 128Mi, Kured = 48Mi + value: '256Mi' // aspnet-webapp-sample = 256Mi, traefik-ingress-controller = 128Mi } excludedNamespaces: { value: [ @@ -1111,7 +1084,7 @@ resource paEnforceImageSource 'Microsoft.Authorization/policyAssignments@2021-06 parameters: { allowedContainerImagesRegex: { // If all images are pull into your ARC instance as described in these instructions you can remove the docker.io & ghcr.io entries. - value: '${acr.name}\\.azurecr\\.io/.+$|mcr\\.microsoft\\.com/.+$|ghcr\\.io/kubereboot/kured.+$|docker\\.io/library/.+$' + value: '${acr.name}\\.azurecr\\.io/.+$|mcr\\.microsoft\\.com/.+$|docker\\.io/library/.+$' } excludedNamespaces: { value: [