From 47a862da8d9f85d7f75d634265e1b2f1e5025706 Mon Sep 17 00:00:00 2001 From: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> Date: Fri, 1 Apr 2022 15:37:44 -0400 Subject: [PATCH] [VCDA-3330 and VCDA-3343] Install Tanzu Core packages and read versions from extra config (#1329) * added core pkg in cloud init script Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * kapp controller draft Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * removed taints, will try worker Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * moved core pkg logic to worker 0 Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * working cluster creation Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * now working creation Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * kapp success check Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * updating core pkg in rde Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * updating rde with core pkg Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * removed commented cloud init code Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * removed debug output Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * addressed review comments Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * install kapp on worker 0 and metrics server on nth worker Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * bug fix Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * installing core pkgs when 0 worker nodes before resize Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * not waiting for tanzu package install Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * addressed review comments Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> * fixed renaming Signed-off-by: ltimothy7 <66969084+ltimothy7@users.noreply.github.com> --- .../v2_x_tkgm/cloud_init_control_plane.yaml | 149 ++++++++------- .../v2_x_tkgm/cloud_init_node.yaml | 108 +++++++++++ .../common/constants/server_constants.py | 23 +++ .../rde/backend/cluster_service_2_x_tkgm.py | 169 +++++++++++++++--- .../rde/models/rde_2_1_0.py | 3 +- cse_def_schema/schema_2_1_0.json | 4 - 6 files changed, 364 insertions(+), 92 deletions(-) diff --git a/cluster_scripts/v2_x_tkgm/cloud_init_control_plane.yaml b/cluster_scripts/v2_x_tkgm/cloud_init_control_plane.yaml index 9ac828273..3abe7a6e9 100644 --- a/cluster_scripts/v2_x_tkgm/cloud_init_control_plane.yaml +++ b/cluster_scripts/v2_x_tkgm/cloud_init_control_plane.yaml @@ -96,6 +96,7 @@ write_files: csi_driver_path=/root/csi-driver.yaml csi_controller_path=/root/csi-controller.yaml csi_node_path=/root/csi-node.yaml + kapp_controller_path=/root/kapp-controller.yaml vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status in_progress" echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf @@ -140,6 +141,84 @@ write_files: systemctl restart containerd vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful" + # openbracket(all caps) will be replaced by the open bracket and closebracket (all caps) + # will be replaced by an open bracket. 
+ # This convention is needed so that python's template format function does not view the bash + # $\openbracket/VAR/\closebracket as a format variable that will be replaced by the python format function. + antrea_version="{antrea_version}" + kapp_controller_version="" + metrics_server_version="" + metrics_server_version_valid=true + vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.status in_progress" + tkr_bom_dir=/tmp/tkr_bom + bom_path=$tkr_bom_dir/bom + mkdir -p $bom_path + components_path=$bom_path/components.yaml + imgpkg_path=$tkr_bom_dir/imgpkg + yq_path=$tkr_bom_dir/yq + default_antrea_version="0.11.3" + + xml_version_property=$(vmtoolsd --cmd "info-get guestinfo.ovfenv" | grep "oe:key=\"VERSION\"") + init_k8s_version=$(echo $xml_version_property | sed 's/.*oe:value=\"//; s/\(.*\)-.*/\1/') + k8s_version=$(echo $init_k8s_version | tr -s "+" "_") + + # download imgpkg, which is needed for getting the components yaml file + wget -nv github.com/vmware-tanzu/carvel-imgpkg/releases/download/v0.24.0/imgpkg-linux-amd64 -O $imgpkg_path + chmod +x $imgpkg_path + + # We need to loop through the `X` value in `tkg.X` because of some TKR unexpected design. + # We increment `X`, looking for a valid tkr bom version. + no_tkr_found=false + until $imgpkg_path pull -i projects.registry.vmware.com/tkg/tkr-bom:$OPENBRACKETk8s_versionCLOSEBRACKET -o $bom_path + do + tkg_version=$(echo $OPENBRACKETk8s_version//*.CLOSEBRACKET) + tkg_version=$((tkg_version+1)) + k8s_version=$(echo $k8s_version | sed "s/.$/"$tkg_version"/") + if [[ $tkg_version -gt 10 ]]; then + no_tkr_found=true + break + fi + done + + mv $bom_path/*.yaml $components_path + # download yq for yaml parsing + wget https://github.com/mikefarah/yq/releases/download/v4.2.0/yq_linux_amd64 -O $yq_path + chmod +x $yq_path + + # handle getting antrea version + if [[ -z "$antrea_version" ]]; then + if [[ "$no_tkr_found" = true ]] ; then + echo "no tkr bom found, will use default component versions" &>> /var/log/cse/customization/status.log + antrea_version=$default_antrea_version + else + # will get antrea version from tkr file + antrea_version=$($yq_path e ".components.antrea[0].version" $components_path | sed 's/+.*//') + if [[ -z "$antrea_version" ]] || [[ "$antrea_version" = "null" ]] || [[ "$antrea_version" = "false" ]]; then + antrea_version=$default_antrea_version + else + antrea_version=$(echo $antrea_version | sed "s/v//") # remove leading `v`, which will be added later + fi + fi + fi + + # get kapp-controller and metrics-server versions, which will be installed on the worker nodes + # These versions are retrieved here since the antrea version is already retrieved and installed + # on the control plane node, so this avoids retrieving core package versions later + kapp_controller_version=$($yq_path e ".components.kapp-controller[0].version" $components_path | sed 's/v//') + metrics_server_version=$($yq_path e ".components.metrics-server[0].version" $components_path | sed 's/v//') + if [[ -z "$metrics_server_version" ]] || [[ "$metrics_server_version" = "null" ]] || [[ "$metrics_server_version" = "false" ]]; then + metrics_server_version_valid=false + echo "metrics server version not valid" >> /var/log/cse/customization/status.log + fi + + # store tkr versions in extra config + vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.kapp_controller $kapp_controller_version" + vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.metrics_server $metrics_server_version" + + # cleanup components 
downloads + rm -rf $tkr_bom_dir + vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.status successful" + vmtoolsd --cmd "info-set guestinfo.postcustomization.kubeinit.status in_progress" # tag images coredns_image_version="" @@ -167,74 +246,13 @@ write_files: vmtoolsd --cmd "info-set guestinfo.kubeconfig $(cat /etc/kubernetes/admin.conf | base64 | tr -d '\n')" vmtoolsd --cmd "info-set guestinfo.postcustomization.kubeinit.status successful" - # open_bracket(all caps) will be replaced by the open bracket and close_bracket (all caps) - # will be replaced by an open bracket. - # This convention is needed so that python's template format function does not view the bash - # $\open_bracket/VAR/\close_bracket as a format variable that will be replaced by the python format function. - antrea_version="{antrea_version}" - vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.status in_progress" - if [[ -z "$antrea_version" ]]; then - tkr_bom_dir=/tmp/tkr_bom - bom_path=$tkr_bom_dir/bom - mkdir -p $bom_path - components_path=$bom_path/components.yaml - imgpkg_path=$tkr_bom_dir/imgpkg - yq_path=$tkr_bom_dir/yq - default_antrea_version="0.11.3" - - xml_version_property=$(vmtoolsd --cmd "info-get guestinfo.ovfenv" | grep "oe:key=\"VERSION\"") - init_k8s_version=$(echo $xml_version_property | sed 's/.*oe:value=\"//; s/\(.*\)-.*/\1/') - k8s_version=$(echo $init_k8s_version | tr -s "+" "_") - - # install imgpkg, which is needed for getting the components yaml file - wget -nv github.com/vmware-tanzu/carvel-imgpkg/releases/download/v0.24.0/imgpkg-linux-amd64 -O $imgpkg_path - chmod +x $imgpkg_path - - # We need to loop through the `X` value in `tkg.X` because of some TKR unexpected design. - # We increment `X`, looking fir a valid tkr bom version. 
- no_tkr_found=false - until $imgpkg_path pull -i projects.registry.vmware.com/tkg/tkr-bom:$OPEN_BRACKETk8s_versionCLOSE_BRACKET -o $bom_path - do - tkg_version=$(echo $OPEN_BRACKETk8s_version//*.CLOSE_BRACKET) - tkg_version=$((tkg_version+1)) - k8s_version=$(echo $k8s_version | sed "s/.$/"$tkg_version"/") - if [[ $tkg_version -gt 10 ]]; then - no_tkr_found=true - break - fi - done - - if [[ "$no_tkr_found" = true ]] ; then - echo "no tkr bom found, will use default component versions" &>> /var/log/cse/customization/status.log - antrea_version=$default_antrea_version - fi - - if [[ "$no_tkr_found" = false ]]; then - mv $bom_path/*.yaml $components_path - - # install yq for yaml parsing - wget https://github.com/mikefarah/yq/releases/download/v4.2.0/yq_linux_amd64 -O $yq_path - chmod +x $yq_path - antrea_version=$($yq_path e ".components.antrea[0].version" $components_path | sed 's/+.*//') - if [[ -z "$antrea_version" ]] || [[ "$antrea_version" = "null" ]] || [[ "$antrea_version" = "false" ]]; then - antrea_version=$default_antrea_version - echo "no antrea version found in tkr bom, will use default antrea version: $OPEN_BRACKETdefault_antrea_versionCLOSE_BRACKET" &>> /var/log/cse/customization/status.log - else - antrea_version=$(echo $antrea_version | sed "s/v//") # remove leading `v`, which will be added later - fi - fi - - # cleanup components downloads - rm -rf $tkr_bom_dir - fi - vmtoolsd --cmd "info-set guestinfo.postcustomization.tkr.get_versions.status successful" - vmtoolsd --cmd "info-set guestinfo.postcustomization.kubectl.cni.install.status in_progress" - antrea_path=/root/antrea-$OPEN_BRACKETantrea_versionCLOSE_BRACKET.yaml - wget -O $antrea_path https://github.com/vmware-tanzu/antrea/releases/download/v$OPEN_BRACKETantrea_versionCLOSE_BRACKET/antrea.yml + antrea_path=/root/antrea-$OPENBRACKETantrea_versionCLOSEBRACKET.yaml + wget -O $antrea_path https://github.com/vmware-tanzu/antrea/releases/download/v$OPENBRACKETantrea_versionCLOSEBRACKET/antrea.yml # This does not need to be done from v0.12.0 onwards inclusive - sed -i "s/image: antrea\/antrea-ubuntu:v$OPEN_BRACKETantrea_versionCLOSE_BRACKET/image: projects.registry.vmware.com\/antrea\/antrea-ubuntu:v$OPEN_BRACKETantrea_versionCLOSE_BRACKET/g" $antrea_path + sed -i "s/image: antrea\/antrea-ubuntu:v$OPENBRACKETantrea_versionCLOSEBRACKET/image: projects.registry.vmware.com\/antrea\/antrea-ubuntu:v$OPENBRACKETantrea_versionCLOSEBRACKET/g" $antrea_path kubectl apply -f $antrea_path + vmtoolsd --cmd "info-set guestinfo.postcustomization.core_packages.antrea_version $antrea_version" vmtoolsd --cmd "info-set guestinfo.postcustomization.kubectl.cni.install.status successful" @@ -280,6 +298,7 @@ write_files: fi vmtoolsd --cmd "info-set guestinfo.postcustomization.kubectl.default_storage_class.status successful" + vmtoolsd --cmd "info-set guestinfo.postcustomization.kubeadm.token.generate.status in_progress" kubeadm_join_info=$(kubeadm token create --print-join-command --ttl 0 2> /dev/null) vmtoolsd --cmd "info-set guestinfo.postcustomization.kubeadm.token.info $kubeadm_join_info" diff --git a/cluster_scripts/v2_x_tkgm/cloud_init_node.yaml b/cluster_scripts/v2_x_tkgm/cloud_init_node.yaml index 76683c676..34c92b085 100644 --- a/cluster_scripts/v2_x_tkgm/cloud_init_node.yaml +++ b/cluster_scripts/v2_x_tkgm/cloud_init_node.yaml @@ -25,6 +25,14 @@ write_files: #!/usr/bin/env bash catch() {{ + kubeconfig_path=/root/kubeconfig.yaml + if [[ -f "$kubeconfig_path" ]]; then + rm $kubeconfig_path + fi + # ensure kubeconfig is null, even if this 
worker doesn't use the kubeconfig to avoid + # getting the config if the value is not set + vmtoolsd --cmd "info-set guestinfo.postcustomization.control_plane.kubeconfig null" + vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?" error_message="$(date) $(caller): $BASH_COMMAND" echo "$error_message" &>> /var/log/cse/customization/error.log @@ -93,6 +101,106 @@ write_files: kubeadm join --config /root/kubeadm-defaults-join.conf --v=10 &> /root/kubeadm-join.out vmtoolsd --cmd "info-set guestinfo.postcustomization.kubeadm.node.join.status successful" + # openbracket(all caps) will be replaced by the open bracket and closebracket (all caps) + # will be replaced by an open bracket. + # This convention is needed so that python's template format function does not view the bash + # $\openbracket/VAR/\closebracket as a format variable that will be replaced by the python format function. + vmtoolsd --cmd "info-set guestinfo.postcustomization.core_packages.attempted_install in_progress" + install_kapp_controller={install_kapp_controller} + kubeconfig_path=/root/kubeconfig.yaml + touch $kubeconfig_path + # Kapp-controller is installed on the first worker node + kapp_controller_version="{kapp_controller_version}" + kapp_controller_version=$(echo $kapp_controller_version | sed 's/+.*//' | sed 's/v//') + install_tanzu_cli_packages={install_tanzu_cli_packages} + if [[ "$install_kapp_controller" = true ]]; then + vmtoolsd --cmd "info-get guestinfo.postcustomization.control_plane.kubeconfig" > $kubeconfig_path + if [[ "$install_tanzu_cli_packages" = false ]]; then + # clear extra config if it won't be used again to avoid leaking it + vmtoolsd --cmd "info-set guestinfo.postcustomization.control_plane.kubeconfig null" + fi + export KUBECONFIG=$kubeconfig_path + + # install kapp-controller, which is needed for tanzu-cli + kapp_controller_installed=false + if [[ ! 
-z "$kapp_controller_version" && $kapp_controller_version != "null" ]]; then + kubectl apply -f https://github.com/vmware-tanzu/carvel-kapp-controller/releases/download/v$OPENBRACKETkapp_controller_versionCLOSEBRACKET/release.yml + fi + fi + + # Metrics server (currently the only tanzu cli installed package) is installed on the last worker node + tanzu_cli_installed=false + if [[ "$install_tanzu_cli_packages" = true ]]; then + vmtoolsd --cmd "info-get guestinfo.postcustomization.control_plane.kubeconfig" > $kubeconfig_path + # clear extra config to avoid leaking it + vmtoolsd --cmd "info-set guestinfo.postcustomization.control_plane.kubeconfig null" + export KUBECONFIG=$kubeconfig_path + metrics_server_version="" + + # Wait for kapp-controller to be ready for at most 8 minutes to be running so that tanzu cli can be fully + # functional for our purposes + kapp_controller_pod=$(kubectl get pods -l=app='kapp-controller' -A -o jsonpath='OPENBRACKET.items[*].metadata.nameCLOSEBRACKET') + kapp_controller_namespace=$(kubectl get pods -l=app='kapp-controller' -A -o jsonpath='OPENBRACKET.items[*].metadata.namespaceCLOSEBRACKET') + kapp_controller_ready_path=/root/kapp_controller_ready.txt + kapp_controller_ready=false + kubectl wait --for=condition=Ready pod/$OPENBRACKETkapp_controller_podCLOSEBRACKET -n $kapp_controller_namespace --timeout=8m > $kapp_controller_ready_path + if [[ -f "$kapp_controller_ready_path" && -s $kapp_controller_ready_path ]]; then + kapp_controller_ready=true + else + kapp_controller_version="" + fi + + if [[ "$kapp_controller_ready" = true ]]; then + # install tanzu cli + tanzu_path=/root/tanzu + mkdir $tanzu_path + tanzu_tar_path=$tanzu_path/tanzu_cli.tar.gz + wget https://github.com/vmware-tanzu/tanzu-framework/releases/download/v0.17.0/tanzu-cli-linux-amd64.tar.gz -O $tanzu_tar_path + tar -zxvf $tanzu_tar_path -C $tanzu_path + sudo install $OPENBRACKETtanzu_pathCLOSEBRACKET/v0.17.0/tanzu-core-linux_amd64 /usr/local/bin/tanzu + export HOME=/root + tanzu plugin install package + + xml_version_property=$(vmtoolsd --cmd "info-get guestinfo.ovfenv" | grep "oe:key=\"VERSION\"") + init_k8s_version=$(echo $xml_version_property | sed 's/.*oe:value=\"//; s/\(.*\)-.*/\1/') + k8s_version=$(echo $init_k8s_version | tr -s "+" "_") + export KUBECONFIG=$kubeconfig_path + tanzu package repository add tanzu-core --namespace tkg-system --create-namespace --url projects.registry.vmware.com/tkg/packages/core/repo:$OPENBRACKETk8s_versionCLOSEBRACKET + + # wait for metrics server to be available + metrics_server_info_str=$(tanzu package available list -A | grep metrics-server) + num_metrics_server_loops=0 + while [[ -z "$metrics_server_info_str" ]]; do + sleep 15 + ((num_metrics_server_loops++)) + if [[ $num_metrics_server_loops -gt 20 ]]; then # max 5 minutes + break + fi + metrics_server_info_str=$(tanzu package available list -A | grep metrics-server) + done + + # install metrics server + metrics_server_version=$(echo $metrics_server_info_str | sed -n 's/^.*\([0-9]\+\.[0-9]\+\.[0-9]\++vmware.[0-9]\+-tkg.[0-9]\+\).*$/\1/p') + if [[ ! 
-z "$metrics_server_version" && $metrics_server_version != "null" ]]; then + # similar to other k8s packages, we are not waiting in order to avoid + # timeout issues crashing the cluster creation + tanzu package install metrics-server --namespace tkg-system --create-namespace --package-name metrics-server.tanzu.vmware.com --version $metrics_server_version --wait=false + fi + + if [[ -z "$kapp_controller_version" ]]; then + kapp_controller_version="null" + fi + vmtoolsd --cmd "info-set guestinfo.postcustomization.core_packages.kapp_controller_version $kapp_controller_version" + if [[ -z "$metrics_server_version" ]]; then + metrics_server_version="null" + fi + vmtoolsd --cmd "info-set guestinfo.postcustomization.core_packages.metrics_server_version $metrics_server_version" + fi + + rm $kubeconfig_path + fi + vmtoolsd --cmd "info-set guestinfo.postcustomization.core_packages.attempted_install successful" + echo "$(date) post customization script execution completed" &>> /var/log/cse/customization/status.log exit 0 diff --git a/container_service_extension/common/constants/server_constants.py b/container_service_extension/common/constants/server_constants.py index d0ff54c2d..8d590e814 100644 --- a/container_service_extension/common/constants/server_constants.py +++ b/container_service_extension/common/constants/server_constants.py @@ -810,11 +810,34 @@ class PostCustomizationPhase(Enum): KUBECTL_APPLY_CPI = 'guestinfo.postcustomization.kubectl.cpi.install.status' # noqa: E501 KUBECTL_APPLY_CSI = 'guestinfo.postcustomization.kubectl.csi.install.status' # noqa: E501 KUBECTL_APPLY_DEFAULT_STORAGE_CLASS = 'guestinfo.postcustomization.kubectl.default_storage_class.status' # noqa: E501 + KUBECTL_APPLY_KAPP_CONTROLLER = 'guestinfo.postcustomization.kubectl.kapp_controller.install' # noqa: E501 KUBEADM_TOKEN_GENERATE = 'guestinfo.postcustomization.kubeadm.token.generate.status' # noqa: E501 KUBEADM_NODE_JOIN = 'guestinfo.postcustomization.kubeadm.node.join.status' PROXY_SETTING = 'guestinfo.postcustomization.proxy.setting.status' + CORE_PACKAGES_ATTEMPTED_INSTALL = 'guestinfo.postcustomization.core_packages.attempted_install' # noqa: E501 +# TO_INSTALL versions indicate versions that the control plane node retrieved +# for worker node(s) to install. INSTALLED_VERSION refers to the version +# that the worker node(s) were able to install. 
+@unique +class PostCustomizationVersions(Enum): + TKR_KAPP_CONTROLLER_VERSION_TO_INSTALL = 'guestinfo.postcustomization.tkr.get_versions.kapp_controller' # noqa: E501 + TKR_METRICS_SERVER_VERSION_TO_INSTALL = 'guestinfo.postcustomization.tkr.get_versions.metrics_server' # noqa: E501 + INSTALLED_VERSION_OF_KAPP_CONTROLLER = 'guestinfo.postcustomization.core_packages.kapp_controller_version' # noqa: E501 + INSTALLED_VERSION_OF_METRICS_SERVER = 'guestinfo.postcustomization.core_packages.metrics_server_version' # noqa: E501 + INSTALLED_VERSION_OF_ANTREA = 'guestinfo.postcustomization.core_packages.antrea_version' # noqa: E501 + + +@unique +class CorePkgVersionKeys(Enum): + KAPP_CONTROLLER = 'kapp-controller' + METRICS_SERVER = 'metrics-server' + ANTREA = 'antrea' + + +PostCustomizationKubeconfig = 'guestinfo.postcustomization.control_plane.kubeconfig' # noqa: E501 + KUBEADM_TOKEN_INFO = 'guestinfo.postcustomization.kubeadm.token.info' KUBE_CONFIG = 'guestinfo.kubeconfig' POST_CUSTOMIZATION_SCRIPT_EXECUTION_STATUS = 'guestinfo.post_customization_script_execution_status' # noqa: E501 diff --git a/container_service_extension/rde/backend/cluster_service_2_x_tkgm.py b/container_service_extension/rde/backend/cluster_service_2_x_tkgm.py index 9f0be714d..4d3b591cb 100644 --- a/container_service_extension/rde/backend/cluster_service_2_x_tkgm.py +++ b/container_service_extension/rde/backend/cluster_service_2_x_tkgm.py @@ -27,7 +27,9 @@ CPI_NAME, \ CSI_DEFAULT_VERSION, \ CSI_NAME, \ - DISK_ENABLE_UUID + DISK_ENABLE_UUID, \ + PostCustomizationKubeconfig, \ + CorePkgVersionKeys from container_service_extension.common.constants.server_constants import ClusterMetadataKey # noqa: E501 from container_service_extension.common.constants.server_constants import ClusterScriptFile # noqa: E501 from container_service_extension.common.constants.server_constants import DefEntityOperation # noqa: E501 @@ -38,6 +40,7 @@ from container_service_extension.common.constants.server_constants import LocalTemplateKey # noqa: E501 from container_service_extension.common.constants.server_constants import NodeType # noqa: E501 from container_service_extension.common.constants.server_constants import PostCustomizationPhase # noqa: E501 +from container_service_extension.common.constants.server_constants import PostCustomizationVersions # noqa: E501 from container_service_extension.common.constants.server_constants import ThreadLocalData # noqa: E501 from container_service_extension.common.constants.server_constants import TKGM_DEFAULT_POD_NETWORK_CIDR # noqa: E501 from container_service_extension.common.constants.server_constants import TKGM_DEFAULT_SERVICE_CIDR # noqa: E501 @@ -882,7 +885,10 @@ def _create_cluster_async(self, cluster_id: str, vapp.reload() try: - expose_ip, _ = _add_control_plane_nodes( + # antrea will be installed on the first control plane node. + # kapp controller and metrics server will be installed on + # the worker nodes. 
+ expose_ip, _, core_pkg_versions = _add_control_plane_nodes( sysadmin_client_v36, user_client=self.context.client, num_nodes=1, @@ -928,8 +934,13 @@ def _create_cluster_async(self, cluster_id: str, f"'{cluster_name}' ({cluster_id})" LOGGER.debug(msg) self._update_task(BehaviorTaskStatus.RUNNING, message=msg) + cni_version = core_pkg_versions.get(CorePkgVersionKeys.ANTREA.value) # noqa: E501 + # because antrea is already installed, remove it from the core pkg + # dictionary so that it is not installed + if cni_version: + del core_pkg_versions[CorePkgVersionKeys.ANTREA.value] try: - _add_worker_nodes( + _, installed_core_pkg_versions = _add_worker_nodes( sysadmin_client_v36, num_nodes=num_workers, org=org, @@ -943,7 +954,8 @@ def _create_cluster_async(self, cluster_id: str, sizing_class_name=worker_sizing_class, cpu_count=worker_cpu_count, memory_mb=worker_memory_mb, - control_plane_join_cmd=control_plane_join_cmd + control_plane_join_cmd=control_plane_join_cmd, + core_pkg_versions_to_install=core_pkg_versions ) except Exception as err: LOGGER.error(err, exc_info=True) @@ -975,6 +987,10 @@ def _create_cluster_async(self, cluster_id: str, # csi `default` field; we will need to look into the spec and we # may need to validate if there is only one default csi csi_elem_rde_status_value.default = True + + # get installed core pkg versions + installed_kapp_controller_version = installed_core_pkg_versions.get(CorePkgVersionKeys.KAPP_CONTROLLER.value, "") # noqa: E501 + installed_metrics_server_version = installed_core_pkg_versions.get(CorePkgVersionKeys.METRICS_SERVER.value, "") # noqa: E501 changes = { 'entity.status.private': rde_2_x.Private( kube_token=control_plane_join_cmd, @@ -998,7 +1014,9 @@ def _create_cluster_async(self, cluster_id: str, 'entity.status.cni': f"{CNI_NAME} {cni_version}", 'entity.status.cpi.name': CPI_NAME, 'entity.status.cpi.version': cpi_version, - 'entity.status.csi': [csi_elem_rde_status_value] + 'entity.status.csi': [csi_elem_rde_status_value], + 'entity.status.tkg_core_packages.kapp_controller': installed_kapp_controller_version, # noqa: E501 + 'entity.status.tkg_core_packages.metrics_server': installed_metrics_server_version # noqa: E501 } # Update status with exposed ip @@ -1381,7 +1399,16 @@ def _create_nodes_async(self, input_native_entity: rde_2_x.NativeEntity): shared_constants.RDEProperty.KUBE_TOKEN.value): # noqa: E501 control_plane_join_cmd = curr_native_entity.status.private.kube_token # noqa: E501 - _add_worker_nodes( + # If the cluster currently only has no worker nodes, then + # resizing the cluster will add the core packages + installed_core_pkg_versions = None + if curr_worker_count == 0: + control_plane_vm = _get_control_plane_vm(sysadmin_client_v36, vapp) # noqa: E501 + core_pkg_versions = _get_core_pkg_versions(control_plane_vm) # noqa: E501 + # remove antrea since it is already installed + if core_pkg_versions.get(CorePkgVersionKeys.ANTREA.value): + del core_pkg_versions[CorePkgVersionKeys.ANTREA.value] + _, installed_core_pkg_versions = _add_worker_nodes( sysadmin_client_v36, num_nodes=num_workers_to_add, org=org, @@ -1395,12 +1422,32 @@ def _create_nodes_async(self, input_native_entity: rde_2_x.NativeEntity): sizing_class_name=worker_sizing_class, cpu_count=worker_cpu_count, memory_mb=worker_memory_mb, - control_plane_join_cmd=control_plane_join_cmd + control_plane_join_cmd=control_plane_join_cmd, + core_pkg_versions_to_install=core_pkg_versions ) msg = f"Added {num_workers_to_add} node(s) to cluster " \ f"{cluster_name}({cluster_id})" 
self._update_task(BehaviorTaskStatus.RUNNING, message=msg) + + # handle updating entity with core package info + if installed_core_pkg_versions and len(installed_core_pkg_versions) > 0: # noqa: E501 + changes = {} + installed_kapp_controller_version = installed_core_pkg_versions.get( # noqa: E501 + CorePkgVersionKeys.KAPP_CONTROLLER.value, "") + if installed_kapp_controller_version: + changes['entity.status.tkg_core_packages.kapp_controller'] = installed_kapp_controller_version # noqa: E501 + installed_metrics_server_version = installed_core_pkg_versions.get( # noqa: E501 + CorePkgVersionKeys.METRICS_SERVER.value, "") + if installed_metrics_server_version: + changes['entity.status.tkg_core_packages.metrics_server'] = installed_metrics_server_version # noqa: E501 + if len(changes) > 0: + self._update_cluster_entity( + cluster_id, + changes=changes, + external_id=vapp_href + ) + msg = f"Created {num_workers_to_add} workers for '{cluster_name}' ({cluster_id}) " # noqa: E501 self._update_task(BehaviorTaskStatus.RUNNING, message=msg) except (exceptions.NodeCreationError, exceptions.ClusterJoiningError) as err: # noqa: E501 @@ -2259,7 +2306,7 @@ def _add_control_plane_nodes( dsc_storage_profile_name=None, dsc_k8s_storage_class_name=None, dsc_filesystem=None, - dsc_use_delete_reclaim_policy=False) -> Tuple[str, List[Dict]]: + dsc_use_delete_reclaim_policy=False) -> Tuple[str, List[Dict], Dict]: vcd_utils.raise_error_if_user_not_from_system_org(sysadmin_client) @@ -2343,6 +2390,7 @@ def _add_control_plane_nodes( spec = vm_specs[0] internal_ip = vapp.get_primary_ip(vm_name=spec['target_vm_name']) + core_pkg_versions = None for spec in vm_specs: vm_name = spec['target_vm_name'] vm_resource = vapp.get_vm(vm_name) @@ -2408,8 +2456,8 @@ def _add_control_plane_nodes( # place bash open and close brackets after the python template # function - cloud_init_spec = cloud_init_spec.replace("OPEN_BRACKET", "{") - cloud_init_spec = cloud_init_spec.replace("CLOSE_BRACKET", "}") + cloud_init_spec = cloud_init_spec.replace("OPENBRACKET", "{") + cloud_init_spec = cloud_init_spec.replace("CLOSEBRACKET", "}") # create a cloud-init spec and update the VMs with it _set_cloud_init_spec(sysadmin_client, vapp, vm, cloud_init_spec) @@ -2427,8 +2475,8 @@ def _add_control_plane_nodes( PostCustomizationPhase.NETWORK_CONFIGURATION, PostCustomizationPhase.STORE_SSH_KEY, PostCustomizationPhase.PROXY_SETTING, - PostCustomizationPhase.KUBEADM_INIT, PostCustomizationPhase.TKR_GET_VERSIONS, + PostCustomizationPhase.KUBEADM_INIT, PostCustomizationPhase.KUBECTL_APPLY_CNI, PostCustomizationPhase.KUBECTL_APPLY_CPI, PostCustomizationPhase.KUBECTL_APPLY_CSI, @@ -2450,6 +2498,8 @@ def _add_control_plane_nodes( ) vapp.reload() + core_pkg_versions = _get_core_pkg_versions(vm) + except Exception as err: LOGGER.error(err, exc_info=True) node_list = [entry.get('target_vm_name') for entry in vm_specs] @@ -2461,24 +2511,43 @@ def _add_control_plane_nodes( raise exceptions.NodeCreationError(node_list, str(err)) - return expose_ip, vm_specs + return expose_ip, vm_specs, core_pkg_versions + + +def _get_core_pkg_versions(control_plane_vm: vcd_vm.VM) -> Dict: + # the values of the dictionary will be None if the key does not exist + # in the vm extra config + core_pkg_versions = { + CorePkgVersionKeys.KAPP_CONTROLLER.value: vcd_utils.get_vm_extra_config_element( # noqa: E501 + control_plane_vm, + PostCustomizationVersions.TKR_KAPP_CONTROLLER_VERSION_TO_INSTALL.value), # noqa: E501 + CorePkgVersionKeys.ANTREA.value: vcd_utils.get_vm_extra_config_element( 
+ control_plane_vm, + PostCustomizationVersions.INSTALLED_VERSION_OF_ANTREA.value) + } + return core_pkg_versions def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, catalog_name, template, network_name, storage_profile=None, ssh_key=None, sizing_class_name=None, cpu_count=None, memory_mb=None, - control_plane_join_cmd='') -> List: + control_plane_join_cmd='', + core_pkg_versions_to_install=None) -> Tuple[List, Dict]: vcd_utils.raise_error_if_user_not_from_system_org(sysadmin_client) + if not core_pkg_versions_to_install: + core_pkg_versions_to_install = {} + if (cpu_count or memory_mb) and sizing_class_name: raise exceptions.BadRequestError("Cannot specify both cpu/memory and " "sizing class for control plane " "node creation") vm_specs = [] + installed_core_pkg_versions = {} if num_nodes <= 0: - return vm_specs + return vm_specs, installed_core_pkg_versions try: templated_script = get_cluster_script_file_contents( @@ -2516,16 +2585,35 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, sizing_class_name=sizing_class_name, cust_script=None, ) - for spec in vm_specs: - spec['cloudinit_node_spec'] = templated_script.format( + + num_vm_specs = len(vm_specs) + for ind in range(num_vm_specs): + spec = vm_specs[ind] + # kapp controller is installed on the 0th worker node + # tanzu cli and metrics server will be installed on the last + # worker node in order to allow time for the kapp controller pod + # to be ready + to_install_tkr_kapp_controller_version = core_pkg_versions_to_install.get(CorePkgVersionKeys.KAPP_CONTROLLER.value, "") # noqa: E501 + should_install_kapp_controller = (ind == 0) and to_install_tkr_kapp_controller_version # noqa: E501 + should_use_kapp_controller_version = ((ind == 0) or (ind == num_vm_specs - 1)) and to_install_tkr_kapp_controller_version # noqa: E501 + should_install_tanzu_cli_packages = (ind == num_vm_specs - 1) and len(core_pkg_versions_to_install) > 0 # noqa: E501 + formatted_script = templated_script.format( vm_host_name=spec['target_vm_name'], ssh_key=ssh_key if ssh_key else '', ip_port=ip_port, token=token, discovery_token_ca_cert_hash=discovery_token_ca_cert_hash, + install_kapp_controller="true" if should_install_kapp_controller else "false", # noqa: E501 + kapp_controller_version=to_install_tkr_kapp_controller_version if should_use_kapp_controller_version else "", # noqa: E501 + install_tanzu_cli_packages="true" if should_install_tanzu_cli_packages else "false", # noqa: E501 **proxy_config ) + formatted_script = formatted_script.replace("OPENBRACKET", "{") + formatted_script = formatted_script.replace("CLOSEBRACKET", "}") + + spec['cloudinit_node_spec'] = formatted_script + task = vapp.add_vms( vm_specs, power_on=False, @@ -2538,7 +2626,10 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, ) vapp.reload() - for spec in vm_specs: + kube_config = _get_kube_config_from_control_plane_vm( + sysadmin_client, vapp) + for ind in range(num_vm_specs): + spec = vm_specs[ind] vm_name = spec['target_vm_name'] vm_resource = vapp.get_vm(vm_name) vm = vcd_vm.VM(sysadmin_client, resource=vm_resource) @@ -2560,6 +2651,16 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, # create a cloud-init spec and update the VMs with it _set_cloud_init_spec(sysadmin_client, vapp, vm, spec['cloudinit_node_spec']) # noqa: E501 + should_use_kubeconfig: bool = ((ind == 0) or (ind == num_vm_specs - 1)) and len(core_pkg_versions_to_install) > 0 # noqa: E501 + if should_use_kubeconfig: + # The worker node will clear this value 
upon reading it or + # failure + task = vm.add_extra_config_element(PostCustomizationKubeconfig, kube_config) # noqa: E501 + sysadmin_client.get_task_monitor().wait_for_status( + task, + callback=wait_for_updating_kubeconfig + ) + task = vm.power_on() # wait_for_vm_power_on is reused for all vm creation callback sysadmin_client.get_task_monitor().wait_for_status( @@ -2576,6 +2677,7 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, PostCustomizationPhase.STORE_SSH_KEY, PostCustomizationPhase.PROXY_SETTING, PostCustomizationPhase.KUBEADM_NODE_JOIN, + PostCustomizationPhase.CORE_PACKAGES_ATTEMPTED_INSTALL, ]: vapp.reload() vcd_utils.wait_for_completion_of_post_customization_procedure( @@ -2585,6 +2687,19 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, ) vm.reload() + # get installed core pkg versions + if should_use_kubeconfig: + sysadmin_client.get_task_monitor().wait_for_status( + task, + callback=wait_for_updating_kubeconfig + ) + installed_core_pkg_versions[CorePkgVersionKeys.KAPP_CONTROLLER.value] = vcd_utils.get_vm_extra_config_element( # noqa: E501 + vm, + PostCustomizationVersions.INSTALLED_VERSION_OF_KAPP_CONTROLLER.value) # noqa: E501 + installed_core_pkg_versions[CorePkgVersionKeys.METRICS_SERVER.value] = vcd_utils.get_vm_extra_config_element( # noqa: E501 + vm, + PostCustomizationVersions.INSTALLED_VERSION_OF_METRICS_SERVER.value) # noqa: E501 + task = vm.add_extra_config_element(DISK_ENABLE_UUID, "1", True) # noqa: E501 sysadmin_client.get_task_monitor().wait_for_status( task, @@ -2605,7 +2720,7 @@ def _add_worker_nodes(sysadmin_client, num_nodes, org, vdc, vapp, raise exceptions.NodeCreationError(node_list, str(err)) - return vm_specs + return vm_specs, installed_core_pkg_versions def _get_node_names(vapp, node_type): @@ -2657,16 +2772,24 @@ def _get_join_cmd(sysadmin_client: vcd_client.Client, vapp): return control_plane_join_cmd -def _get_kube_config_from_control_plane_vm(sysadmin_client: vcd_client.Client, vapp): # noqa: E501 +def _get_control_plane_vm(sysadmin_client: vcd_client.Client, vapp): vcd_utils.raise_error_if_user_not_from_system_org(sysadmin_client) vapp.reload() node_names = _get_node_names(vapp, NodeType.CONTROL_PLANE) if not node_names: - raise exceptions.KubeconfigNotFound("No control plane node found") # noqa: E501 + raise Exception("No control plane node found") vm_resource = vapp.get_vm(node_names[0]) control_plane_vm = vcd_vm.VM(sysadmin_client, resource=vm_resource) control_plane_vm.reload() + return control_plane_vm + + +def _get_kube_config_from_control_plane_vm(sysadmin_client: vcd_client.Client, vapp): # noqa: E501 + try: + control_plane_vm = _get_control_plane_vm(sysadmin_client, vapp) + except Exception as e: + raise exceptions.KubeconfigNotFound(str(e)) kube_config: str = vcd_utils.get_vm_extra_config_element(control_plane_vm, KUBE_CONFIG) # noqa: E501 if not kube_config: raise exceptions.KubeconfigNotFound("kubeconfig not found in control plane extra configuration") # noqa: E501 @@ -2716,13 +2839,17 @@ def wait_for_updating_disk_enable_uuid(task): def wait_for_updating_cloud_init_spec(task): - LOGGER.debug(f"cloud init spec, status: {task.get('status').lower()}") # noqa: E501 + LOGGER.debug(f"cloud init spec, status: {task.get('status').lower()}") def wait_for_updating_cloud_init_spec_encoding(task): LOGGER.debug(f"cloud init spec encoding, status: {task.get('status').lower()}") # noqa: E501 +def wait_for_updating_kubeconfig(task): + LOGGER.debug(f"adding kubeconfig, status: {task.get('status').lower()}") + + 
def _create_k8s_software_string(software_name: str, software_version: str) -> str: # noqa: E501 """Generate string containing the software name and version. diff --git a/container_service_extension/rde/models/rde_2_1_0.py b/container_service_extension/rde/models/rde_2_1_0.py index 18d443f57..083f017a5 100644 --- a/container_service_extension/rde/models/rde_2_1_0.py +++ b/container_service_extension/rde/models/rde_2_1_0.py @@ -198,7 +198,6 @@ class Private: class TkgCorePackages: kapp_controller: Optional[str] = None metrics_server: Optional[str] = None - tanzu_addons_manager: Optional[str] = None @dataclass_json(letter_case=LetterCase.CAMEL) @@ -219,7 +218,7 @@ class Status: private: Optional[Private] = None csi: Optional[List[CsiElement]] = None cpi: Cpi = Cpi() - tkgCorePackages: TkgCorePackages = TkgCorePackages() + tkg_core_packages: TkgCorePackages = TkgCorePackages() @dataclass_json(letter_case=LetterCase.CAMEL) diff --git a/cse_def_schema/schema_2_1_0.json b/cse_def_schema/schema_2_1_0.json index 0a9808f0d..0260a0c35 100644 --- a/cse_def_schema/schema_2_1_0.json +++ b/cse_def_schema/schema_2_1_0.json @@ -480,10 +480,6 @@ "metricsServer":{ "type":"string", "description":"The metrics-server version." - }, - "tanzuAddonsManager":{ - "type":"string", - "description":"The tanzu-addons-manager version." } }, "additionalProperties":true
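The bracket-placeholder convention referenced in the cloud-init comments above works as follows: Python's str.format() treats every brace pair in the template as a substitution field, so literal bash expansions in the scripts are spelled $OPENBRACKETvarCLOSEBRACKET and converted back to real braces only after formatting, which is what the replace("OPENBRACKET", "{") / replace("CLOSEBRACKET", "}") calls in _add_control_plane_nodes and _add_worker_nodes do. A minimal sketch of that flow, assuming a hypothetical helper name render_cloud_init that is not part of this patch:

# Sketch of the OPENBRACKET/CLOSEBRACKET templating convention used by the
# cloud-init scripts in this patch. The helper name is illustrative only.

def render_cloud_init(template: str, **values: str) -> str:
    """Fill str.format() placeholders, then restore literal bash braces.

    Real substitution fields such as {antrea_version} are filled by
    str.format(); bash expansions are written as $OPENBRACKETvarCLOSEBRACKET
    so that format() does not treat them as substitution fields, and the
    placeholders are swapped back to braces only after formatting.
    """
    rendered = template.format(**values)
    rendered = rendered.replace("OPENBRACKET", "{")
    rendered = rendered.replace("CLOSEBRACKET", "}")
    return rendered


if __name__ == "__main__":
    template = (
        'antrea_version="{antrea_version}"\n'
        'wget -O /root/antrea-$OPENBRACKETantrea_versionCLOSEBRACKET.yaml '
        'https://github.com/vmware-tanzu/antrea/releases/download/'
        'v$OPENBRACKETantrea_versionCLOSEBRACKET/antrea.yml\n'
    )
    print(render_cloud_init(template, antrea_version="1.2.3"))
    # antrea_version="1.2.3"
    # wget -O /root/antrea-${antrea_version}.yaml https://github.com/...

Formatting must happen before the placeholder replacement; doing the replacement first would reintroduce braces that str.format() would then try to interpret, which is exactly the conflict the convention exists to avoid.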