From aa1d8bc76b3c03a6c3b9bd8cbc6576f8d9288968 Mon Sep 17 00:00:00 2001
From: Michi Mutsuzaki
Date: Tue, 25 Jun 2024 20:57:19 +0000
Subject: [PATCH] eks: Run cilium-cli inside a container

Update {eks,eks-tunnel}.yaml to run cilium-cli inside a container instead
of using cilium-cli-test-job-chart.

Ref: #2623
Ref: #2627
Ref: cilium/design-cfps#9

Signed-off-by: Michi Mutsuzaki
---
 .github/in-cluster-test-scripts/eks-tunnel.sh |  42 ------
 .../in-cluster-test-scripts/eks-uninstall.sh  |  10 --
 .github/in-cluster-test-scripts/eks.sh        |  34 -----
 .github/workflows/eks-tunnel.yaml             | 122 ++++++++----------
 .github/workflows/eks.yaml                    | 117 +++++++---------
 5 files changed, 100 insertions(+), 225 deletions(-)
 delete mode 100644 .github/in-cluster-test-scripts/eks-tunnel.sh
 delete mode 100644 .github/in-cluster-test-scripts/eks-uninstall.sh
 delete mode 100644 .github/in-cluster-test-scripts/eks.sh

diff --git a/.github/in-cluster-test-scripts/eks-tunnel.sh b/.github/in-cluster-test-scripts/eks-tunnel.sh
deleted file mode 100644
index 9dded9c0ed..0000000000
--- a/.github/in-cluster-test-scripts/eks-tunnel.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Install Cilium
-cilium install \
-  --version "${CILIUM_VERSION}" \
-  --set cluster.name="${CLUSTER_NAME}" \
-  --wait=false \
-  --set bpf.monitorAggregation=none \
-  --datapath-mode=tunnel \
-  --set loadBalancer.l7.backend=envoy \
-  --set tls.secretsBackend=k8s \
-  --set ipam.mode=cluster-pool
-
-# Enable Relay
-cilium hubble enable
-
-# Wait for cilium and hubble relay to be ready
-# NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
-cilium status --wait
-
-# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
-[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
-
-# Port forward Relay
-cilium hubble port-forward&
-sleep 10s
-[[ $(pgrep -f "cilium.*hubble.*port-forward|kubectl.*port-forward.*hubble-relay" | wc -l) == 2 ]]
-
-# Run connectivity test
-cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com. \
-  --test '!dns-only,!to-fqdns,!client-egress-l7,!health'
-  # workaround for nslookup issues in tunnel mode causing tests to fail reliably
-  # TODO: remove once:
-  # - https://github.com/cilium/cilium/issues/16975 is fixed
-  # - fix has been deployed to a stable branch
-  # - cilium-cli default cilium version has been updated to pick up the fix
-
-# Run performance test
-cilium connectivity perf --duration 1s
diff --git a/.github/in-cluster-test-scripts/eks-uninstall.sh b/.github/in-cluster-test-scripts/eks-uninstall.sh
deleted file mode 100644
index be09d951bb..0000000000
--- a/.github/in-cluster-test-scripts/eks-uninstall.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Uninstall Cilium
-cilium uninstall --wait
-
-# Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
-[[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
diff --git a/.github/in-cluster-test-scripts/eks.sh b/.github/in-cluster-test-scripts/eks.sh
deleted file mode 100644
index f6034eba67..0000000000
--- a/.github/in-cluster-test-scripts/eks.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Install Cilium
-cilium install \
-  --version "${CILIUM_VERSION}" \
-  --set cluster.name="${CLUSTER_NAME}" \
-  --wait=false \
-  --set loadBalancer.l7.backend=envoy \
-  --set tls.secretsBackend=k8s \
-  --set bpf.monitorAggregation=none
-
-# Enable Relay
-cilium hubble enable
-
-# Wait for cilium and hubble relay to be ready
-# NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
-cilium status --wait
-
-# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
-[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
-
-# Port forward Relay
-cilium hubble port-forward&
-sleep 10s
-[[ $(pgrep -f "cilium.*hubble.*port-forward|kubectl.*port-forward.*hubble-relay" | wc -l) == 2 ]]
-
-# Run connectivity test
-cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
-
-# Run performance test
-cilium connectivity perf --duration 1s
diff --git a/.github/workflows/eks-tunnel.yaml b/.github/workflows/eks-tunnel.yaml
index 47285f6b9d..abdc56148d 100644
--- a/.github/workflows/eks-tunnel.yaml
+++ b/.github/workflows/eks-tunnel.yaml
@@ -63,20 +63,19 @@ jobs:
           sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
           rm eksctl_$(uname -s)_amd64.tar.gz
 
-      - name: Install helm
-        uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
-        with:
-          # Due to the below issue, v3.8.2 is pinned currently to avoid
-          # exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
-          # https://github.com/helm/helm/issues/10975
-          version: v3.8.2
-
       - name: Set up AWS CLI credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           role-to-assume: ${{ secrets.AWS_PR_ASSUME_ROLE }}
           aws-region: ${{ env.region }}
 
+      - name: Run aws configure
+        run: |
+          aws configure set aws_access_key_id ${{ env.AWS_ACCESS_KEY_ID }}
+          aws configure set aws_secret_access_key ${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws configure set aws_session_token ${{ env.AWS_SESSION_TOKEN }}
+          aws configure set default.region ${{ env.AWS_REGION }}
+
       - name: Set up job variables
         id: vars
         run: |
@@ -126,14 +125,11 @@ jobs:
           eksctl create cluster -f ./eks-config.yaml
 
-      - name: Create kubeconfig and load it in configmap
-        run: |
-          .github/get-kubeconfig.sh
-          kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
-
-      - name: Load cilium cli script in configmap
-        run: |
-          kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-tunnel.sh
+      - name: Install Cilium CLI
+        uses: ./
+        with:
+          skip-build: 'true'
+          image-tag: ${{ steps.vars.outputs.sha }}
 
       - name: Create cilium-cli test job
         run: |
           helm install .github/cilium-cli-test-job-chart \
             --generate-name \
             --set tag=${{ steps.vars.outputs.sha }} \
             --set cilium_version=${{ env.cilium_version }} \
             --set cluster_name=${{ env.clusterName }}
 
-      - name: Wait for job
-        env:
-          timeout: 30m
+      - name: Install Cilium and run tests
+        timeout-minutes: 30
         run: |
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
-
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Install Cilium
+          cilium install \
+            --version "${{ env.cilium_version }}" \
+            --set cluster.name="${{ env.clusterName }}" \
+            --wait=false \
+            --set bpf.monitorAggregation=none \
+            --datapath-mode=tunnel \
+            --set loadBalancer.l7.backend=envoy \
+            --set tls.secretsBackend=k8s \
+            --set ipam.mode=cluster-pool
+
+          # Enable Relay
+          cilium hubble enable
+
+          # Wait for cilium and hubble relay to be ready
+          # NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
+          cilium status --wait
+
+          # Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
+          [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
+
+          # Port forward Relay
+          cilium hubble port-forward&
+          sleep 10s
+          [[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
+
+          # Run connectivity test
+          cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com. \
+            --test '!dns-only,!to-fqdns,!client-egress-l7,!health'
+            # workaround for nslookup issues in tunnel mode causing tests to fail reliably
+            # TODO: remove once:
+            # - https://github.com/cilium/cilium/issues/16975 is fixed
+            # - fix has been deployed to a stable branch
+            # - cilium-cli default cilium version has been updated to pick up the fix
+
+          # Run performance test
+          cilium connectivity perf --duration 1s
 
       - name: Post-test information gathering
         if: ${{ !success() }}
         run: |
-          echo "=== Install latest stable CLI ==="
-          curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
-          sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
-          sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
-          rm cilium-linux-amd64.tar.gz{,.sha256sum}
-          cilium version
-
           echo "=== Retrieve cluster state ==="
           kubectl get pods --all-namespaces -o wide
           cilium status
 
       - name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
         if: ${{ success() }}
-        env:
-          timeout: 5m
+        timeout-minutes: 5
         run: |
-          kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cluster_name=${{ env.clusterName }} \
-            --set job_name=cilium-cli-uninstall \
-            --set test_script_cm=cilium-cli-test-script-uninstall
-
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
+          cilium uninstall --wait
 
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
+          [[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
 
       - name: Clean up EKS
         if: ${{ always() }}
diff --git a/.github/workflows/eks.yaml b/.github/workflows/eks.yaml
index 90b97d0f91..d4a71eac7c 100644
--- a/.github/workflows/eks.yaml
+++ b/.github/workflows/eks.yaml
@@ -63,23 +63,23 @@ jobs:
           sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
           rm eksctl_$(uname -s)_amd64.tar.gz
 
-      - name: Install helm
-        uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
-        with:
-          # Due to the below issue, v3.8.2 is pinned currently to avoid
-          # exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
-          # https://github.com/helm/helm/issues/10975
-          version: v3.8.2
-
       - name: Set up AWS CLI credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           role-to-assume: ${{ secrets.AWS_PR_ASSUME_ROLE }}
           aws-region: ${{ env.region }}
 
+      - name: Run aws configure
+        run: |
+          aws configure set aws_access_key_id ${{ env.AWS_ACCESS_KEY_ID }}
+          aws configure set aws_secret_access_key ${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws configure set aws_session_token ${{ env.AWS_SESSION_TOKEN }}
+          aws configure set default.region ${{ env.AWS_REGION }}
+
       - name: Set up job variables
         id: vars
         run: |
+          env
           if [ ${{ github.event.issue.pull_request || github.event.pull_request }} ]; then
             PR_API_JSON=$(curl \
               -H "Accept: application/vnd.github.v3+json" \
@@ -126,54 +126,48 @@ jobs:
           eksctl create cluster -f ./eks-config.yaml
 
-      - name: Create kubeconfig and load it in configmap
-        run: |
-          .github/get-kubeconfig.sh
-          kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
+      - name: Install Cilium CLI
+        uses: ./
+        with:
+          skip-build: 'true'
+          image-tag: ${{ steps.vars.outputs.sha }}
 
-      - name: Load cilium cli script in configmap
+      - name: Install Cilium and run tests
+        timeout-minutes: 30
         run: |
-          kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks.sh
+          # Install Cilium
+          cilium install \
+            --version "${{ env.cilium_version }}" \
+            --set cluster.name="${{ env.clusterName }}" \
+            --wait=false \
+            --set loadBalancer.l7.backend=envoy \
+            --set tls.secretsBackend=k8s \
+            --set bpf.monitorAggregation=none
 
-      - name: Create cilium-cli test job
-        run: |
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cilium_version=${{ env.cilium_version }} \
-            --set cluster_name=${{ env.clusterName }}
-
-      - name: Wait for job
-        env:
-          timeout: 30m
-        run: |
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
+          # Enable Relay
+          cilium hubble enable
 
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
+          # Wait for cilium and hubble relay to be ready
+          # NB: necessary to work against occasional flakes due to https://github.com/cilium/cilium-cli/issues/918
+          cilium status --wait
 
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
+          # Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
+          [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
 
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Port forward Relay
+          cilium hubble port-forward&
+          sleep 10s
+          [[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
+
+          # Run connectivity test
+          cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
+
+          # Run performance test
+          cilium connectivity perf --duration 1s
 
       - name: Post-test information gathering
         if: ${{ !success() }}
         run: |
-          echo "=== Install latest stable CLI ==="
-          curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
-          sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
-          sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
-          rm cilium-linux-amd64.tar.gz{,.sha256sum}
-          cilium version
-
           echo "=== Retrieve cluster state ==="
           kubectl get pods --all-namespaces -o wide
           cilium status
@@ -182,33 +176,12 @@ jobs:
 
       - name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
         if: ${{ success() }}
-        env:
-          timeout: 5m
+        timeout-minutes: 5
         run: |
-          kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cluster_name=${{ env.clusterName }} \
-            --set job_name=cilium-cli-uninstall \
-            --set test_script_cm=cilium-cli-test-script-uninstall
-
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
-
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          cilium uninstall --wait
+
+          # Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
+          [[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
 
       - name: Clean up EKS
         if: ${{ always() }}
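Note on the relocated test logic: the new "Install Cilium and run tests" step is plain cilium-cli invocations on the runner, so the same flow can be reproduced outside CI against an existing EKS cluster. A minimal sketch of the eks.yaml (non-tunnel) variant, assuming CILIUM_VERSION and CLUSTER_NAME are supplied by the caller and kubectl already points at the cluster:

    #!/bin/bash
    # Rough local equivalent of the "Install Cilium and run tests" step in eks.yaml.
    # CILIUM_VERSION and CLUSTER_NAME are placeholders for this sketch.
    set -euxo pipefail

    cilium install \
      --version "${CILIUM_VERSION}" \
      --set cluster.name="${CLUSTER_NAME}" \
      --wait=false \
      --set loadBalancer.l7.backend=envoy \
      --set tls.secretsBackend=k8s \
      --set bpf.monitorAggregation=none

    # Enable Relay and wait for cilium and hubble relay to be ready.
    cilium hubble enable
    cilium status --wait

    # The 'aws-node' DaemonSet must still exist but schedule no pods.
    [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]

    # Port forward Relay, then run the connectivity and performance tests.
    cilium hubble port-forward &
    sleep 10
    cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
    cilium connectivity perf --duration 1s

The tunnel variant differs only in the extra install flags (--datapath-mode=tunnel and ipam.mode=cluster-pool) and in the --test filter passed to cilium connectivity test.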
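On the new "Run aws configure" step: the patch itself does not say why it is needed on top of aws-actions/configure-aws-credentials. A plausible reading is that copying the temporary credentials from the runner environment into the shared ~/.aws config files makes them available to processes that do not inherit that environment, such as the exec-based EKS authentication (aws eks get-token) performed on behalf of the containerized CLI; treat that as an assumption, not something the patch states. A short, hypothetical sanity check for the credential handoff (CLUSTER_NAME and AWS_REGION are placeholders):

    # Hypothetical check that file-based AWS credentials resolve and reach the cluster.
    aws sts get-caller-identity
    aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${AWS_REGION}"
    kubectl -n kube-system get ds/aws-node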
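Both workflows keep the two aws-node assertions from the deleted scripts, now inline: while Cilium is installed the 'aws-node' DaemonSet must exist but schedule zero pods, and after cilium uninstall the io.cilium/aws-node-enabled nodeSelector that blocks it must be gone from the DaemonSet spec. The same checks, wrapped in hypothetical helper functions for local reuse:

    # Hypothetical helpers mirroring the aws-node assertions in eks.yaml and eks-tunnel.yaml.
    assert_aws_node_unscheduled() {
      # With Cilium installed, aws-node exists but has no scheduled pods.
      [[ $(kubectl -n kube-system get ds/aws-node \
            -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
    }

    assert_aws_node_selector_removed() {
      # After 'cilium uninstall --wait', the blocking nodeSelector is removed again.
      [[ ! $(kubectl -n kube-system get ds/aws-node \
            -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
    }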