From ea9526c5ea029d83414af8850b688dd45835cf0f Mon Sep 17 00:00:00 2001 From: Zach Loafman Date: Fri, 31 Mar 2023 20:48:20 +0000 Subject: [PATCH 1/2] Extend e2e queue timings This PR reworks the e2e timeouts to allow for more time for a given build to wait to run e2es, but tightens the e2e deadline slightly: * Tighten the per-e2e-configuration testcase to 1.5h. e2es are coming in close to an hour in some cases but now that we're not running consul, we don't need it as high as 2h. I don't think it's worth tightening this all the way to an hour, though it would probably work. * Also drops the queueTtl for the CI sub-builds, these should not be queued for long since we serialize e2es now. * Extends the e2e-wait-to-become-leader timeout to 3h. In higher traffic times, we're hitting this limit often now, which only results in a vicious cycle of retrying PRs. Instead wait longer to become leader. * Bumps the global timeout to 5h after aggregating: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) * Remove vestiges of consul - it's no longer running anywhere. --- build/e2e-image/entrypoint.sh | 21 ++------------------- ci/e2e-test-cloudbuild.yaml | 4 ++-- cloudbuild.yaml | 4 ++-- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/build/e2e-image/entrypoint.sh b/build/e2e-image/entrypoint.sh index e3ff4e7402..841e412c91 100644 --- a/build/e2e-image/entrypoint.sh +++ b/build/e2e-image/entrypoint.sh @@ -33,22 +33,5 @@ fi gcloud container clusters get-credentials $TEST_CLUSTER_NAME \ --zone=${TEST_CLUSTER_LOCATION} --project=agones-images -# TODO: Here we're using the presence of consul to dictate whether we use consul -# port forwarding or whether we rely on Cloud Build serialization from #2932. This -# allows us to quickly recover (by reinstalling consul) if something breaks. -# After a few more days of PRs, it should be safe to remove this. 
-if kubectl get statefulset/consul-consul-server -oname >& /dev/null -then - echo "Using legacy consul locking" - kubectl port-forward statefulset/consul-consul-server 8500:8500 & - echo "Waiting consul port-forward to launch on 8500..." - timeout 60 bash -c 'until printf "" 2>>/dev/null >>/dev/tcp/$0/$1; do sleep 1; done' 127.0.0.1 8500 - echo "consul port-forward launched. Starting e2e tests..." - echo "consul lock -child-exit-code=true -timeout 90m -verbose LockE2E '/root/e2e.sh "$FEATURES" "$CLOUD_PRODUCT" "$REGISTRY"'" - consul lock -child-exit-code=true -timeout 90m -verbose LockE2E '/root/e2e.sh "'$FEATURES'" "'$CLOUD_PRODUCT'" "'$REGISTRY'"' - killall -q kubectl || true - echo "successfully killed kubectl proxy" -else - echo /root/e2e.sh "${FEATURES}" "${CLOUD_PRODUCT}" "${REGISTRY}" - /root/e2e.sh "${FEATURES}" "${CLOUD_PRODUCT}" "${REGISTRY}" -fi +echo /root/e2e.sh "${FEATURES}" "${CLOUD_PRODUCT}" "${REGISTRY}" +/root/e2e.sh "${FEATURES}" "${CLOUD_PRODUCT}" "${REGISTRY}" diff --git a/ci/e2e-test-cloudbuild.yaml b/ci/e2e-test-cloudbuild.yaml index e90d777460..907624b891 100644 --- a/ci/e2e-test-cloudbuild.yaml +++ b/ci/e2e-test-cloudbuild.yaml @@ -60,6 +60,6 @@ steps: - e2e-feature-gates tags: ['e2e-test'] -timeout: 7200s # 2h -queueTtl: 21600s # 6h +timeout: 5400s # 1.5h +queueTtl: 7200s # 2h // only one set of e2es should be running at once diff --git a/cloudbuild.yaml b/cloudbuild.yaml index fe71aafd73..548fa2d541 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -271,7 +271,7 @@ steps: sleep 60 done - timeout: 5400s # 90m - leave an hour for e2es to run on top of the global timeout of 2.5h + timeout: 10800s # 3h - if you change this, change the global timeout as well env: - 'CLOUDSDK_CORE_PROJECT=$PROJECT_ID' - 'BUILD_ID=$BUILD_ID' @@ -414,7 +414,7 @@ substitutions: _RUST_SDK_BUILD_CACHE_KEY: rust-sdk-build _REGISTRY: us-docker.pkg.dev/${PROJECT_ID}/ci tags: ['ci'] -timeout: 9000s # 2.5h - if you change this, change e2e-wait-to-become-leader as 
well +timeout: 18000s # 5h: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) queueTtl: 259200s # 72h images: - '${_REGISTRY}/agones-controller' From d713c7abb866ffcec5c0c5b5148262b573aeecc2 Mon Sep 17 00:00:00 2001 From: Zach Loafman Date: Mon, 3 Apr 2023 22:36:46 +0000 Subject: [PATCH 2/2] Stop testing on Autopilot 1.26 until after #3046 --- cloudbuild.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 548fa2d541..f06d8c5b56 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -307,6 +307,10 @@ steps: for version in "${!versionsAndRegions[@]}" do region=${versionsAndRegions[$version]} + if [ $cloudProduct = gke-autopilot ] && [ $version = 1.26 ] + then + continue + fi if [ $cloudProduct = generic ] then featureWithGate="CustomFasSyncInterval=false&SafeToEvict=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&FleetAllocationOverflow=true&Example=true"