From fbc82e70c2a7d2e3a912027ff11ae33430ec071d Mon Sep 17 00:00:00 2001 From: Francisco Augusto Date: Fri, 2 Feb 2024 18:10:17 +0100 Subject: [PATCH] Disable azure cloud routes & fix azure csi drivers upgrade (#421) * Disable azure cloud routes & fix azure csi drivers upgrade * Clean code * Update upgrad script to cluster-operator pre release * Fix azurefile csi driver upgrade * Remove not working code * Clean code * Scale cloud-controller-manager to 2 replicas * Fix capz * Remove untested code * Fix cloud-controller-manager procedure --------- Co-authored-by: stg <65890694+stg-0@users.noreply.github.com> --- scripts/upgrade-provisioner_v0.3.py | 223 +++++++++++++++++++--------- 1 file changed, 151 insertions(+), 72 deletions(-) diff --git a/scripts/upgrade-provisioner_v0.3.py b/scripts/upgrade-provisioner_v0.3.py index 1d744e92bb..8a87608b72 100644 --- a/scripts/upgrade-provisioner_v0.3.py +++ b/scripts/upgrade-provisioner_v0.3.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# TODO: Don't prepare capsule if doesn't exist - ############################################################## # Author: Stratio Clouds # # Date: 14/11/2023 # @@ -32,7 +30,7 @@ CALICO_NODE_VERSION = "v1.30.5" AZUREDISK_CSI_DRIVER_CHART = "v1.28.3" AZUREFILE_CSI_DRIVER_CHART = "v1.28.3" -CLOUD_PROVIDER_AZURE_CHART = "v1.28.0" +CLOUD_PROVIDER_AZURE_CHART = "v1.26.7" CLUSTER_OPERATOR = "0.1.7" CLOUD_PROVISIONER = "0.17.0-0.3.7" @@ -92,43 +90,85 @@ def backup(backup_dir, namespace, cluster_name): # Backup capsule files os.makedirs(backup_dir + "/capsule", exist_ok=True) - command = kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o yaml 2>/dev/null > " + backup_dir + "/capsule/capsule-mutating-webhook-configuration.yaml" - status, output = subprocess.getstatusoutput(command) - if status != 0: - print("[ERROR] Backing up capsule files failed:\n" + output) - sys.exit(1) - command = kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o yaml 2>/dev/null > " + backup_dir + "/capsule/capsule-validating-webhook-configuration.yaml" - status, output = subprocess.getstatusoutput(command) - if status != 0: - print("[ERROR] Backing up capsule files failed:\n" + output) - sys.exit(1) + command = kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration" + status, _ = subprocess.getstatusoutput(command) + if status == 0: + command = kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o yaml 2>/dev/null > " + backup_dir + "/capsule/capsule-mutating-webhook-configuration.yaml" + status, output = subprocess.getstatusoutput(command) + if status != 0: + print("[ERROR] Backing up capsule files failed:\n" + output) + sys.exit(1) + command = kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration" + status, _ = subprocess.getstatusoutput(command) + if status == 0: + command = kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o yaml 2>/dev/null > " + backup_dir + "/capsule/capsule-validating-webhook-configuration.yaml" + status, output = subprocess.getstatusoutput(command) + if status != 0: + print("[ERROR] Backing up capsule files failed:\n" + output) + sys.exit(1) def prepare_capsule(dry_run): print("[INFO] Preparing capsule-mutating-webhook-configuration for the upgrade process:", end =" ", flush=True) - command = (kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o json | " + - 
'''jq -r '.webhooks[0].objectSelector |= {"matchExpressions":[{"key":"name","operator":"NotIn","values":["kube-system","tigera-operator","calico-system","cert-manager","capi-system","''' + - namespace + '''","capi-kubeadm-bootstrap-system","capi-kubeadm-control-plane-system"]},{"key":"kubernetes.io/metadata.name","operator":"NotIn","values":["kube-system","tigera-operator","calico-system","cert-manager","capi-system","''' + - namespace + '''","capi-kubeadm-bootstrap-system","capi-kubeadm-control-plane-system"]}]}' | ''' + kubectl + " apply -f -") - execute_command(command, dry_run) + command = kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration" + status, output = subprocess.getstatusoutput(command) + if status != 0: + if "NotFound" in output: + print("SKIP") + else: + print("[ERROR] Preparing capsule-mutating-webhook-configuration failed:\n" + output) + sys.exit(1) + else: + command = (kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o json | " + + '''jq -r '.webhooks[0].objectSelector |= {"matchExpressions":[{"key":"name","operator":"NotIn","values":["kube-system","tigera-operator","calico-system","cert-manager","capi-system","''' + + namespace + '''","capi-kubeadm-bootstrap-system","capi-kubeadm-control-plane-system"]},{"key":"kubernetes.io/metadata.name","operator":"NotIn","values":["kube-system","tigera-operator","calico-system","cert-manager","capi-system","''' + + namespace + '''","capi-kubeadm-bootstrap-system","capi-kubeadm-control-plane-system"]}]}' | ''' + kubectl + " apply -f -") + execute_command(command, dry_run) print("[INFO] Preparing capsule-validating-webhook-configuration for the upgrade process:", end =" ", flush=True) - command = (kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o json | " + - '''jq -r '.webhooks[] |= (select(.name == "namespaces.capsule.clastix.io").objectSelector |= ({"matchExpressions":[{"key":"name","operator":"NotIn","values":["''' + - namespace + '''","tigera-operator","calico-system"]},{"key":"kubernetes.io/metadata.name","operator":"NotIn","values":["''' + - namespace + '''","tigera-operator","calico-system"]}]}))' | ''' + kubectl + " apply -f -") - execute_command(command, dry_run) + command = kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration" + status, _ = subprocess.getstatusoutput(command) + if status != 0: + if "NotFound" in output: + print("SKIP") + else: + print("[ERROR] Preparing capsule-validating-webhook-configuration failed:\n" + output) + sys.exit(1) + else: + command = (kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o json | " + + '''jq -r '.webhooks[] |= (select(.name == "namespaces.capsule.clastix.io").objectSelector |= ({"matchExpressions":[{"key":"name","operator":"NotIn","values":["''' + + namespace + '''","tigera-operator","calico-system"]},{"key":"kubernetes.io/metadata.name","operator":"NotIn","values":["''' + + namespace + '''","tigera-operator","calico-system"]}]}))' | ''' + kubectl + " apply -f -") + execute_command(command, dry_run) def restore_capsule(dry_run): print("[INFO] Restoring capsule-mutating-webhook-configuration:", end =" ", flush=True) - command = (kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o json | " + - "jq -r '.webhooks[0].objectSelector |= {}' | " + kubectl + " apply -f -") - execute_command(command, dry_run) + command = kubectl + " get mutatingwebhookconfigurations 
capsule-mutating-webhook-configuration" + status, output = subprocess.getstatusoutput(command) + if status != 0: + if "NotFound" in output: + print("SKIP") + else: + print("[ERROR] Restoring capsule-mutating-webhook-configuration failed:\n" + output) + sys.exit(1) + else: + command = (kubectl + " get mutatingwebhookconfigurations capsule-mutating-webhook-configuration -o json | " + + "jq -r '.webhooks[0].objectSelector |= {}' | " + kubectl + " apply -f -") + execute_command(command, dry_run) print("[INFO] Restoring capsule-validating-webhook-configuration:", end =" ", flush=True) - command = (kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o json | " + - """jq -r '.webhooks[] |= (select(.name == "namespaces.capsule.clastix.io").objectSelector |= {})' """ + - "| " + kubectl + " apply -f -") - execute_command(command, dry_run) + command = kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration" + status, _ = subprocess.getstatusoutput(command) + if status != 0: + if "NotFound" in output: + print("SKIP") + else: + print("[ERROR] Restoring capsule-validating-webhook-configuration failed:\n" + output) + sys.exit(1) + else: + command = (kubectl + " get validatingwebhookconfigurations capsule-validating-webhook-configuration -o json | " + + """jq -r '.webhooks[] |= (select(.name == "namespaces.capsule.clastix.io").objectSelector |= {})' """ + + "| " + kubectl + " apply -f -") + execute_command(command, dry_run) def add_pdbs(provider, namespace, dry_run): pdb = "" @@ -384,77 +424,119 @@ def upgrade_capx(kubeconfig, provider, namespace, version, env_vars, dry_run): command = kubectl + " -n capi-kubeadm-bootstrap-system scale --replicas " + replicas + " deploy capi-kubeadm-bootstrap-controller-manager" execute_command(command, dry_run) -def upgrade_drivers(dry_run): +def upgrade_drivers(cluster, cluster_name, dry_run): # Azuredisk CSI driver print("[INFO] Upgrading Azuredisk CSI driver to " + AZUREDISK_CSI_DRIVER_CHART + ":", end =" ", flush=True) chart_version = subprocess.getstatusoutput(helm + " list -A | grep azuredisk-csi-driver")[1].split()[9] if chart_version == AZUREDISK_CSI_DRIVER_CHART: print("SKIP") else: + chart_values = subprocess.getoutput(helm + " -n kube-system get values azuredisk-csi-driver -o json") + f = open('./azurediskcsidriver.values', 'w') + f.write(chart_values) + f.close() command = (helm + " -n kube-system upgrade azuredisk-csi-driver azuredisk-csi-driver" + - " --wait --reset-values --version " + AZUREDISK_CSI_DRIVER_CHART + + " --wait --version " + AZUREDISK_CSI_DRIVER_CHART + " --values ./azurediskcsidriver.values" + " --set controller.podAnnotations.\"cluster-autoscaler\\.kubernetes\\.io/safe-to-evict-local-volumes=socket-dir\\,azure-cred\"" + " --repo https://raw.githubusercontent.com/kubernetes-sigs/azuredisk-csi-driver/master/charts") execute_command(command, dry_run) + os.remove("./azurediskcsidriver.values") # Azurefile CSI driver status, output = subprocess.getstatusoutput(helm + " list -A | grep azurefile-csi-driver") if status == 0: - print("[INFO] Upgrading Azurefile CSI driver to " + AZUREFILE_CSI_DRIVER_CHART + ":", end =" ", flush=True) chart_version = output.split()[9] - if chart_version == AZUREFILE_CSI_DRIVER_CHART: - print("SKIP") + chart_namespace = output.split()[1] + chart_values = subprocess.getoutput(helm + " -n " + chart_namespace + " get values azurefile-csi-driver -o yaml") + output = subprocess.getoutput(kubectl + " get csidrivers file.csi.azure.com -o yaml") + 
fsGroupPolicy = yaml.safe_load(output)["spec"]["fsGroupPolicy"] + if chart_values == "null": + chartValuesYaml = {"feature": {}} else: - command = (helm + " -n kube-system upgrade azurefile-csi-driver azurefile-csi-driver" + - " --wait --reset-values --version " + AZUREFILE_CSI_DRIVER_CHART + + chartValuesYaml = yaml.safe_load(chart_values) + if "feature" not in chartValuesYaml: + chartValuesYaml["feature"] = {} + if "fsGroupPolicy" in chartValuesYaml["feature"]: + if chartValuesYaml["feature"]["fsGroupPolicy"] != fsGroupPolicy: + chartValuesYaml["feature"]["fsGroupPolicy"] = fsGroupPolicy + chart_values = yaml.dump(chartValuesYaml) + elif fsGroupPolicy != "ReadWriteOnceWithFSType": + chartValuesYaml["feature"]["fsGroupPolicy"] = fsGroupPolicy + chart_values = yaml.dump(chartValuesYaml) + f = open('./azurefilecsidriver.values', 'w') + f.write(chart_values) + f.close() + if chart_namespace != "kube-system": + print("[INFO] Uninstalling Azurefile CSI driver:", end =" ", flush=True) + command = helm + " -n " + chart_namespace + " uninstall azurefile-csi-driver" + execute_command(command, dry_run) + print("[INFO] Installing Azurefile CSI driver " + AZUREFILE_CSI_DRIVER_CHART + " in kube-system namespace:", end =" ", flush=True) + command = (helm + " -n kube-system install azurefile-csi-driver azurefile-csi-driver" + + " --wait --version " + AZUREFILE_CSI_DRIVER_CHART + " --values ./azurefilecsidriver.values" + " --set controller.podAnnotations.\"cluster-autoscaler\\.kubernetes\\.io/safe-to-evict-local-volumes=socket-dir\\,azure-cred\"" + " --repo https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi-driver/master/charts") execute_command(command, dry_run) + else: + print("[INFO] Upgrading Azurefile CSI driver to " + AZUREFILE_CSI_DRIVER_CHART + ":", end =" ", flush=True) + if chart_version == AZUREFILE_CSI_DRIVER_CHART: + print("SKIP") + else: + command = (helm + " -n kube-system upgrade azurefile-csi-driver azurefile-csi-driver" + + " --wait --version " + AZUREFILE_CSI_DRIVER_CHART + " --values ./azurefilecsidriver.values" + + " --set controller.podAnnotations.\"cluster-autoscaler\\.kubernetes\\.io/safe-to-evict-local-volumes=socket-dir\\,azure-cred\"" + + " --repo https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi-driver/master/charts") + execute_command(command, dry_run) else: print("[INFO] Installing Azurefile CSI driver " + AZUREFILE_CSI_DRIVER_CHART + ":", end =" ", flush=True) command = (helm + " -n kube-system install azurefile-csi-driver azurefile-csi-driver" + " --wait --version " + AZUREFILE_CSI_DRIVER_CHART + " --set controller.podAnnotations.\"cluster-autoscaler\\.kubernetes\\.io/safe-to-evict-local-volumes=socket-dir\\,azure-cred\"" + " --repo https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi-driver/master/charts") + if os.path.isfile('./azurefilecsidriver.values'): + command += " --values ./azurefilecsidriver.values" execute_command(command, dry_run) + if os.path.isfile('./azurefilecsidriver.values'): + os.remove("./azurefilecsidriver.values") # Cloud provider Azure status, output = subprocess.getstatusoutput(helm + " list -A | grep cloud-provider-azure") if status == 0: chart_version = output.split()[8].split("-")[3] chart_namespace = output.split()[1] - chart_values = subprocess.getoutput(helm + " -n " + chart_namespace + " get values cloud-provider-azure -o json") - if not dry_run: + if chart_version == CLOUD_PROVIDER_AZURE_CHART[1:] and chart_namespace == "kube-system": + print("SKIP") + else: + chart_values = subprocess.getoutput(helm + 
" -n " + chart_namespace + " get values cloud-provider-azure -o json") f = open('./cloudproviderazure.values', 'w') f.write(chart_values) f.close() - - if chart_namespace != "kube-system": print("[INFO] Uninstalling Cloud Provider Azure:", end =" ", flush=True) command = helm + " -n " + chart_namespace + " uninstall cloud-provider-azure" execute_command(command, dry_run) print("[INFO] Installing Cloud Provider Azure " + CLOUD_PROVIDER_AZURE_CHART + " in kube-system namespace:", end =" ", flush=True) command = (helm + " -n kube-system install cloud-provider-azure cloud-provider-azure" + " --wait --version " + CLOUD_PROVIDER_AZURE_CHART + " --values ./cloudproviderazure.values" + + " --set cloudControllerManager.configureCloudRoutes=false" + + " --set cloudControllerManager.replicas=2" + " --repo https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo") execute_command(command, dry_run) - else: - print("[INFO] Upgrading Cloud Provider Azure to " + CLOUD_PROVIDER_AZURE_CHART + ":", end =" ", flush=True) - if chart_version == CLOUD_PROVIDER_AZURE_CHART[1:]: - print("SKIP") - else: - command = (helm + " -n kube-system upgrade cloud-provider-azure cloud-provider-azure" + - " --wait --version " + CLOUD_PROVIDER_AZURE_CHART + " --values ./cloudproviderazure.values" + - " --repo https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo") - execute_command(command, dry_run) else: - print("[INFO] Installing Cloud Provider Azure " + CLOUD_PROVIDER_AZURE_CHART + " in kube-system namespace:", end =" ", flush=True) + print("[INFO] Installing Cloud Provider Azure " + CLOUD_PROVIDER_AZURE_CHART + ":", end =" ", flush=True) command = (helm + " -n kube-system install cloud-provider-azure cloud-provider-azure" + - " --wait --version " + CLOUD_PROVIDER_AZURE_CHART + " --values ./cloudproviderazure.values" + + " --wait --version " + CLOUD_PROVIDER_AZURE_CHART + + " --set cloudControllerManager.configureCloudRoutes=false" + + " --set cloudControllerManager.replicas=2" + " --repo https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo") + if os.path.isfile('./cloudproviderazure.values'): + command += " --values ./cloudproviderazure.values" + else: + podsCidrBlock = "192.168.0.0/16" + if "networks" in cluster["spec"]: + if "pods_cidr" in cluster["spec"]["networks"]: + podsCidrBlock = cluster["spec"]["networks"]["pods_cidr"] + command += " --set infra.clusterName=" + cluster_name + " --set 'cloudControllerManager.clusterCIDR=" + podsCidrBlock + "'" execute_command(command, dry_run) - - if not dry_run: + if os.path.isfile('./cloudproviderazure.values'): os.remove("./cloudproviderazure.values") def upgrade_calico(dry_run): @@ -470,28 +552,24 @@ def upgrade_calico(dry_run): print("FAILED (" + output + ")") sys.exit(1) - # Get the current calico values - values = subprocess.getoutput(helm + " -n tigera-operator get values calico -o json") - values = values.replace("v3.25.1", CALICO_VERSION) - values = values.replace("v1.29.3", CALICO_NODE_VERSION) - values = values.replace('"podAnnotations":{}', '"podAnnotations":{"cluster-autoscaler.kubernetes.io/safe-to-evict-local-volumes": "var-lib-calico"}') - - # Write calico values to file - if not dry_run: - calico_values = open('./calico.values', 'w') - calico_values.write(values) - calico_values.close() - print("[INFO] Upgrading Calico to " + CALICO_VERSION + ":", end =" ", flush=True) if chart_version == CALICO_VERSION: print("SKIP") else: + # Get the current calico values + values 
= subprocess.getoutput(helm + " -n tigera-operator get values calico -o json") + values = values.replace("v3.25.1", CALICO_VERSION) + values = values.replace("v1.29.3", CALICO_NODE_VERSION) + values = values.replace('"podAnnotations":{}', '"podAnnotations":{"cluster-autoscaler.kubernetes.io/safe-to-evict-local-volumes": "var-lib-calico"}') + + # Write calico values to file + calico_values = open('./calico.values', 'w') + calico_values.write(values) + calico_values.close() command = (helm + " -n tigera-operator upgrade calico tigera-operator" + - " --wait --version " + CALICO_VERSION + " --values ./calico.values" + + " --wait --wait-for-jobs --version " + CALICO_VERSION + " --values ./calico.values" + " --repo https://docs.projectcalico.org/charts") execute_command(command, dry_run) - - if not dry_run: os.remove("./calico.values") def install_cluster_operator(helm_repo, keos_registry, docker_registries, dry_run): @@ -701,6 +779,7 @@ def request_confirmation(): if "github_token" in data["secrets"]: env_vars += " GITHUB_TOKEN=" + data["secrets"]["github_token"] helm = "GITHUB_TOKEN=" + data["secrets"]["github_token"] + " " + helm + kubectl = "GITHUB_TOKEN=" + data["secrets"]["github_token"] + " " + kubectl # Set helm repo helm_repo["url"] = config["helm_repo"] @@ -740,7 +819,7 @@ def request_confirmation(): request_confirmation() if (config["all"] or config["only_drivers"]) and provider == "azure": - upgrade_drivers(config["dry_run"]) + upgrade_drivers(cluster, cluster_name, config["dry_run"]) if not config["yes"]: request_confirmation() @@ -761,8 +840,8 @@ def request_confirmation(): if config["all"] or config["only_cluster_operator_descriptor"]: create_cluster_operator_descriptor(cluster, cluster_name, helm_repo, config["dry_run"]) - if not config["yes"]: - request_confirmation() if not config["disable_prepare_capsule"]: restore_capsule(config["dry_run"]) + + print("[INFO] Upgrade process finished successfully")
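
Note: the recurring pattern this patch introduces is to capture a release's current user-supplied Helm values before upgrading and pass them back with --values, instead of discarding them via --reset-values. Below is a minimal, self-contained sketch of that pattern, assuming the same subprocess-based style as the script; the helper name upgrade_chart_preserving_values and the example arguments are illustrative placeholders, not part of upgrade-provisioner_v0.3.py.

#!/usr/bin/env python3
# Sketch of the "preserve current values, then upgrade" pattern used throughout
# upgrade-provisioner_v0.3.py (names and arguments here are illustrative only).
import os
import subprocess
import sys

helm = "helm"

def upgrade_chart_preserving_values(release, chart, namespace, version, repo, dry_run=False):
    # Dump the release's current user-supplied values to a temporary file so the
    # upgrade keeps them (the script passes --values instead of --reset-values).
    values = subprocess.getoutput(f"{helm} -n {namespace} get values {release} -o yaml")
    values_file = f"./{release}.values"
    with open(values_file, "w") as f:
        f.write(values)
    command = (f"{helm} -n {namespace} upgrade {release} {chart}"
               f" --wait --version {version} --values {values_file}"
               f" --repo {repo}")
    if dry_run:
        print("DRY-RUN: " + command)
    else:
        status, output = subprocess.getstatusoutput(command)
        if status != 0:
            print("FAILED (" + output + ")")
            sys.exit(1)
        print("OK")
    os.remove(values_file)

# Hypothetical usage, mirroring the Azuredisk CSI driver upgrade in the patch:
# upgrade_chart_preserving_values(
#     "azuredisk-csi-driver", "azuredisk-csi-driver", "kube-system", "v1.28.3",
#     "https://raw.githubusercontent.com/kubernetes-sigs/azuredisk-csi-driver/master/charts")

The same write-values-to-file step is what lets the patch relocate releases installed in the wrong namespace (azurefile-csi-driver, cloud-provider-azure): the values file survives the uninstall and is fed to the fresh install in kube-system.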