From aa1d8bc76b3c03a6c3b9bd8cbc6576f8d9288968 Mon Sep 17 00:00:00 2001
From: Michi Mutsuzaki
Date: Tue, 25 Jun 2024 20:57:19 +0000
Subject: [PATCH] eks: Run cilium-cli inside a container

Update {eks,eks-tunnel}.yaml to run cilium-cli inside a container instead
of using cilium-cli-test-job-chart.

Ref: #2623
Ref: #2627
Ref: cilium/design-cfps#9

Signed-off-by: Michi Mutsuzaki
---
 .github/in-cluster-test-scripts/eks-tunnel.sh |  42 ------
 .../in-cluster-test-scripts/eks-uninstall.sh  |  10 --
 .github/in-cluster-test-scripts/eks.sh        |  34 -----
 .github/workflows/eks-tunnel.yaml             | 122 ++++++++----------
 .github/workflows/eks.yaml                    | 117 +++++++---------
 5 files changed, 100 insertions(+), 225 deletions(-)
 delete mode 100644 .github/in-cluster-test-scripts/eks-tunnel.sh
 delete mode 100644 .github/in-cluster-test-scripts/eks-uninstall.sh
 delete mode 100644 .github/in-cluster-test-scripts/eks.sh

diff --git a/.github/in-cluster-test-scripts/eks-tunnel.sh b/.github/in-cluster-test-scripts/eks-tunnel.sh
deleted file mode 100644
index 9dded9c0ed..0000000000
--- a/.github/in-cluster-test-scripts/eks-tunnel.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Install Cilium
-cilium install \
-  --version "${CILIUM_VERSION}" \
-  --set cluster.name="${CLUSTER_NAME}" \
-  --wait=false \
-  --set bpf.monitorAggregation=none \
-  --datapath-mode=tunnel \
-  --set loadBalancer.l7.backend=envoy \
-  --set tls.secretsBackend=k8s \
-  --set ipam.mode=cluster-pool
-
-# Enable Relay
-cilium hubble enable
-
-# Wait for cilium and hubble relay to be ready
-# NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
-cilium status --wait
-
-# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
-[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
-
-# Port forward Relay
-cilium hubble port-forward&
-sleep 10s
-[[ $(pgrep -f "cilium.*hubble.*port-forward|kubectl.*port-forward.*hubble-relay" | wc -l) == 2 ]]
-
-# Run connectivity test
-cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com. \
-  --test '!dns-only,!to-fqdns,!client-egress-l7,!health'
-  # workaround for nslookup issues in tunnel mode causing tests to fail reliably
-  # TODO: remove once:
-  # - https://github.com/cilium/cilium/issues/16975 is fixed
-  # - fix has been deployed to a stable branch
-  # - cilium-cli default cilium version has been updated to pick up the fix
-
-# Run performance test
-cilium connectivity perf --duration 1s
diff --git a/.github/in-cluster-test-scripts/eks-uninstall.sh b/.github/in-cluster-test-scripts/eks-uninstall.sh
deleted file mode 100644
index be09d951bb..0000000000
--- a/.github/in-cluster-test-scripts/eks-uninstall.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Uninstall Cilium
-cilium uninstall --wait
-
-# Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
-[[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
diff --git a/.github/in-cluster-test-scripts/eks.sh b/.github/in-cluster-test-scripts/eks.sh
deleted file mode 100644
index f6034eba67..0000000000
--- a/.github/in-cluster-test-scripts/eks.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-# Install Cilium
-cilium install \
-  --version "${CILIUM_VERSION}" \
-  --set cluster.name="${CLUSTER_NAME}" \
-  --wait=false \
-  --set loadBalancer.l7.backend=envoy \
-  --set tls.secretsBackend=k8s \
-  --set bpf.monitorAggregation=none
-
-# Enable Relay
-cilium hubble enable
-
-# Wait for cilium and hubble relay to be ready
-# NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
-cilium status --wait
-
-# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
-[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
-
-# Port forward Relay
-cilium hubble port-forward&
-sleep 10s
-[[ $(pgrep -f "cilium.*hubble.*port-forward|kubectl.*port-forward.*hubble-relay" | wc -l) == 2 ]]
-
-# Run connectivity test
-cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
-
-# Run performance test
-cilium connectivity perf --duration 1s
diff --git a/.github/workflows/eks-tunnel.yaml b/.github/workflows/eks-tunnel.yaml
index 47285f6b9d..abdc56148d 100644
--- a/.github/workflows/eks-tunnel.yaml
+++ b/.github/workflows/eks-tunnel.yaml
@@ -63,20 +63,19 @@ jobs:
           sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
           rm eksctl_$(uname -s)_amd64.tar.gz
 
-      - name: Install helm
-        uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
-        with:
-          # Due to the below issue, v3.8.2 is pinned currently to avoid
-          # exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
-          # https://github.com/helm/helm/issues/10975
-          version: v3.8.2
-
       - name: Set up AWS CLI credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           role-to-assume: ${{ secrets.AWS_PR_ASSUME_ROLE }}
           aws-region: ${{ env.region }}
 
+      - name: Run aws configure
+        run: |
+          aws configure set aws_access_key_id ${{ env.AWS_ACCESS_KEY_ID }}
+          aws configure set aws_secret_access_key ${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws configure set aws_session_token ${{ env.AWS_SESSION_TOKEN }}
+          aws configure set default.region ${{ env.AWS_REGION }}
+
       - name: Set up job variables
         id: vars
         run: |
@@ -126,14 +125,11 @@ jobs:
           eksctl create cluster -f ./eks-config.yaml
 
-      - name: Create kubeconfig and load it in configmap
-        run: |
-          .github/get-kubeconfig.sh
-          kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
-
-      - name: Load cilium cli script in configmap
-        run: |
-          kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-tunnel.sh
+      - name: Install Cilium CLI
+        uses: ./
+        with:
+          skip-build: 'true'
+          image-tag: ${{ steps.vars.outputs.sha }}
 
       - name: Create cilium-cli test job
         run: |
           helm install .github/cilium-cli-test-job-chart \
             --generate-name \
             --set tag=${{ steps.vars.outputs.sha }} \
             --set cilium_version=${{ env.cilium_version }} \
             --set cluster_name=${{ env.clusterName }}
 
-      - name: Wait for job
-        env:
-          timeout: 30m
+      - name: Install Cilium and run tests
+        timeout-minutes: 30
         run: |
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
-
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Install Cilium
+          cilium install \
+            --version "${{ env.cilium_version }}" \
+            --set cluster.name="${{ env.clusterName }}" \
+            --wait=false \
+            --set bpf.monitorAggregation=none \
+            --datapath-mode=tunnel \
+            --set loadBalancer.l7.backend=envoy \
+            --set tls.secretsBackend=k8s \
+            --set ipam.mode=cluster-pool
+
+          # Enable Relay
+          cilium hubble enable
+
+          # Wait for cilium and hubble relay to be ready
+          # NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918
+          cilium status --wait
+
+          # Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
+          [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
+
+          # Port forward Relay
+          cilium hubble port-forward&
+          sleep 10s
+          [[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
+
+          # Run connectivity test
+          cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com. \
+            --test '!dns-only,!to-fqdns,!client-egress-l7,!health'
+            # workaround for nslookup issues in tunnel mode causing tests to fail reliably
+            # TODO: remove once:
+            # - https://github.com/cilium/cilium/issues/16975 is fixed
+            # - fix has been deployed to a stable branch
+            # - cilium-cli default cilium version has been updated to pick up the fix
+
+          # Run performance test
+          cilium connectivity perf --duration 1s
 
       - name: Post-test information gathering
         if: ${{ !success() }}
         run: |
-          echo "=== Install latest stable CLI ==="
-          curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
-          sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
-          sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
-          rm cilium-linux-amd64.tar.gz{,.sha256sum}
-          cilium version
-
           echo "=== Retrieve cluster state ==="
           kubectl get pods --all-namespaces -o wide
           cilium status
 
       - name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
         if: ${{ success() }}
-        env:
-          timeout: 5m
+        timeout-minutes: 5
         run: |
-          kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cluster_name=${{ env.clusterName }} \
-            --set job_name=cilium-cli-uninstall \
-            --set test_script_cm=cilium-cli-test-script-uninstall
-
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
+          cilium uninstall --wait
 
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
+          [[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
 
       - name: Clean up EKS
         if: ${{ always() }}
diff --git a/.github/workflows/eks.yaml b/.github/workflows/eks.yaml
index 90b97d0f91..d4a71eac7c 100644
--- a/.github/workflows/eks.yaml
+++ b/.github/workflows/eks.yaml
@@ -63,23 +63,23 @@ jobs:
           sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
           rm eksctl_$(uname -s)_amd64.tar.gz
 
-      - name: Install helm
-        uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
-        with:
-          # Due to the below issue, v3.8.2 is pinned currently to avoid
-          # exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
-          # https://github.com/helm/helm/issues/10975
-          version: v3.8.2
-
       - name: Set up AWS CLI credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           role-to-assume: ${{ secrets.AWS_PR_ASSUME_ROLE }}
           aws-region: ${{ env.region }}
 
+      - name: Run aws configure
+        run: |
+          aws configure set aws_access_key_id ${{ env.AWS_ACCESS_KEY_ID }}
+          aws configure set aws_secret_access_key ${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws configure set aws_session_token ${{ env.AWS_SESSION_TOKEN }}
+          aws configure set default.region ${{ env.AWS_REGION }}
+
       - name: Set up job variables
         id: vars
         run: |
+          env
           if [ ${{ github.event.issue.pull_request || github.event.pull_request }} ]; then
             PR_API_JSON=$(curl \
               -H "Accept: application/vnd.github.v3+json" \
@@ -126,54 +126,48 @@ jobs:
           eksctl create cluster -f ./eks-config.yaml
 
-      - name: Create kubeconfig and load it in configmap
-        run: |
-          .github/get-kubeconfig.sh
-          kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
+      - name: Install Cilium CLI
+        uses: ./
+        with:
+          skip-build: 'true'
+          image-tag: ${{ steps.vars.outputs.sha }}
 
-      - name: Load cilium cli script in configmap
+      - name: Install Cilium and run tests
+        timeout-minutes: 30
         run: |
-          kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks.sh
+          # Install Cilium
+          cilium install \
+            --version "${{ env.cilium_version }}" \
+            --set cluster.name="${{ env.clusterName }}" \
+            --wait=false \
+            --set loadBalancer.l7.backend=envoy \
+            --set tls.secretsBackend=k8s \
+            --set bpf.monitorAggregation=none
 
-      - name: Create cilium-cli test job
-        run: |
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cilium_version=${{ env.cilium_version }} \
-            --set cluster_name=${{ env.clusterName }}
-
-      - name: Wait for job
-        env:
-          timeout: 30m
-        run: |
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
+          # Enable Relay
+          cilium hubble enable
 
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
+          # Wait for cilium and hubble relay to be ready
+          # NB: necessary to work against occasional flakes due to https://github.com/cilium/cilium-cli/issues/918
+          cilium status --wait
 
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
+          # Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
+          [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
 
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          # Port forward Relay
+          cilium hubble port-forward&
+          sleep 10s
+          [[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
+
+          # Run connectivity test
+          cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
+
+          # Run performance test
+          cilium connectivity perf --duration 1s
 
       - name: Post-test information gathering
         if: ${{ !success() }}
         run: |
-          echo "=== Install latest stable CLI ==="
-          curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
-          sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
-          sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
-          rm cilium-linux-amd64.tar.gz{,.sha256sum}
-          cilium version
-
           echo "=== Retrieve cluster state ==="
           kubectl get pods --all-namespaces -o wide
           cilium status
@@ -182,33 +176,12 @@ jobs:
 
       - name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
         if: ${{ success() }}
-        env:
-          timeout: 5m
+        timeout-minutes: 5
         run: |
-          kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
-          helm install .github/cilium-cli-test-job-chart \
-            --generate-name \
-            --set tag=${{ steps.vars.outputs.sha }} \
-            --set cluster_name=${{ env.clusterName }} \
-            --set job_name=cilium-cli-uninstall \
-            --set test_script_cm=cilium-cli-test-script-uninstall
-
-          # Background wait for job to complete or timeout
-          kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
-          complete_pid=$!
-
-          # Background wait for job to fail
-          (kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
-          failed_pid=$!
-
-          # Active wait for whichever background process ends first
-          wait -n $complete_pid $failed_pid
-          EXIT_CODE=$?
-
-          # Retrieve job logs
-          kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
-          exit ${EXIT_CODE}
-        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
+          cilium uninstall --wait
+
+          # Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
+          [[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
 
       - name: Clean up EKS
         if: ${{ always() }}
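Note on the relocated test logic: the new "Install Cilium and run tests" step is plain cilium-cli invocations on the runner, so the same flow can be reproduced outside CI against an existing EKS cluster. A minimal sketch of the eks.yaml (non-tunnel) variant, assuming CILIUM_VERSION and CLUSTER_NAME are supplied by the caller and kubectl already points at the cluster:

    #!/bin/bash
    # Rough local equivalent of the "Install Cilium and run tests" step in eks.yaml.
    # CILIUM_VERSION and CLUSTER_NAME are placeholders for this sketch.
    set -euxo pipefail

    cilium install \
      --version "${CILIUM_VERSION}" \
      --set cluster.name="${CLUSTER_NAME}" \
      --wait=false \
      --set loadBalancer.l7.backend=envoy \
      --set tls.secretsBackend=k8s \
      --set bpf.monitorAggregation=none

    # Enable Relay and wait for cilium and hubble relay to be ready.
    cilium hubble enable
    cilium status --wait

    # The 'aws-node' DaemonSet must still exist but schedule no pods.
    [[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]

    # Port forward Relay, then run the connectivity and performance tests.
    cilium hubble port-forward &
    sleep 10
    cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
    cilium connectivity perf --duration 1s

The tunnel variant differs only in the extra install flags (--datapath-mode=tunnel and ipam.mode=cluster-pool) and in the --test filter passed to cilium connectivity test.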
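On the new "Run aws configure" step: the patch itself does not say why it is needed on top of aws-actions/configure-aws-credentials. A plausible reading is that copying the temporary credentials from the runner environment into the shared ~/.aws config files makes them available to processes that do not inherit that environment, such as the exec-based EKS authentication (aws eks get-token) performed on behalf of the containerized CLI; treat that as an assumption, not something the patch states. A short, hypothetical sanity check for the credential handoff (CLUSTER_NAME and AWS_REGION are placeholders):

    # Hypothetical check that file-based AWS credentials resolve and reach the cluster.
    aws sts get-caller-identity
    aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${AWS_REGION}"
    kubectl -n kube-system get ds/aws-node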
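Both workflows keep the two aws-node assertions from the deleted scripts, now inline: while Cilium is installed the 'aws-node' DaemonSet must exist but schedule zero pods, and after cilium uninstall the io.cilium/aws-node-enabled nodeSelector that blocks it must be gone from the DaemonSet spec. The same checks, wrapped in hypothetical helper functions for local reuse:

    # Hypothetical helpers mirroring the aws-node assertions in eks.yaml and eks-tunnel.yaml.
    assert_aws_node_unscheduled() {
      # With Cilium installed, aws-node exists but has no scheduled pods.
      [[ $(kubectl -n kube-system get ds/aws-node \
            -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
    }

    assert_aws_node_selector_removed() {
      # After 'cilium uninstall --wait', the blocking nodeSelector is removed again.
      [[ ! $(kubectl -n kube-system get ds/aws-node \
            -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
    }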