From ca8d9f4a8dcb1bdb7be066866e96321cc5c1bf86 Mon Sep 17 00:00:00 2001
From: TerrenceMcGuinness-NOAA
Date: Wed, 16 Oct 2024 15:51:11 -0400
Subject: [PATCH] Github pipelines and utils for running CI on parallel works (#3007)

# Description

This PR has the GitHub Pipeline script in the `.github/workflows` directory for running CI tests to be performed on an AWS virtual cluster. It is set up to be launched from the dispatch action on the Actions tab. For now it will only run C48_ATM.

Resolves #3006

Once the yaml pipeline is in the `.github/workflows` directory of the default branch we can test it against [PR 2977](https://github.com/NOAA-EMC/global-workflow/pull/2977) which may be needed to build on Parallel Works CentOS AWS.

Code managers can check to see if the self-hosted runner [globalworkflow_parallelworks](https://github.com/NOAA-EMC/global-workflow/settings/actions/runners/22) is up and ready by checking the [Running](https://github.com/NOAA-EMC/global-workflow/settings/actions/runners) Settings.

In pending work we should also be able to spin up the cluster on demand from GitHub.

# Type of change
- [ ] Bug fix (fixes something broken)
- [ ] New feature (adds functionality)
- [x] Maintenance (code refactor, clean-up, new CI test, etc.)

# Change characteristics
- Is this a breaking change (a change in existing functionality)? YES/NO
- Does this change require a documentation update? YES/NO
- Does this change require an update to any of the following submodules? YES/NO (If YES, please add a link to any PRs that are pending.)
  - [ ] EMC verif-global
  - [ ] GDAS
  - [ ] GFS-utils
  - [ ] GSI
  - [ ] GSI-monitor
  - [ ] GSI-utils
  - [ ] UFS-utils
  - [ ] UFS-weather-model
  - [ ] wxflow

# How has this been tested?

# Checklist
- [ ] Any dependent changes have been merged and published
- [x] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have documented my code, including function, input, and output descriptions
- [ ] My changes generate no new warnings
- [ ] New and existing tests pass with my changes
- [x] This change is covered by an existing CI test or a new one has been added
- [ ] I have made corresponding changes to the system documentation if necessary

---------

Co-authored-by: tmcguinness
---
Review notes (free-form text between the "---" line and the first diff is not
part of the commit message):
- create-experiments now needs build-link rather than checkout, so experiments
  are only created and run after build_all.sh and link_workflow.sh have finished.
- provision_runner.sh falls back to `id -un` when USER is empty and stops if
  the runner tarball cannot be copied or extracted.
- Only existing "+" lines were edited, so hunk sizes and the diffstat are
  unchanged; the "index" blob ids were not regenerated.

 ...balworkflow-ci.yaml => pw_aws_centos.yaml} | 46 ++++++++++---------
 .../parallel_works/UserBootstrap_centos7.txt  |  5 ++
 .../utils/parallel_works/provision_runner.sh  | 39 ++++++++++++++++
 3 files changed, 68 insertions(+), 22 deletions(-)
 rename .github/workflows/{globalworkflow-ci.yaml => pw_aws_centos.yaml} (63%)
 create mode 100644 ci/scripts/utils/parallel_works/UserBootstrap_centos7.txt
 create mode 100755 ci/scripts/utils/parallel_works/provision_runner.sh

diff --git a/.github/workflows/globalworkflow-ci.yaml b/.github/workflows/pw_aws_centos.yaml
similarity index 63%
rename from .github/workflows/globalworkflow-ci.yaml
rename to .github/workflows/pw_aws_centos.yaml
index 1474c79a48..549a3ea0fa 100644
--- a/.github/workflows/globalworkflow-ci.yaml
+++ b/.github/workflows/pw_aws_centos.yaml
@@ -1,4 +1,4 @@
-name: gw-ci-orion
+name: gw-ci-aws-centos
 
 on: [workflow_dispatch]
 
@@ -15,28 +15,31 @@ on: [workflow_dispatch]
 #         └── ${pslot}
 env:
   TEST_DIR: ${{ github.workspace }}/${{ github.run_id }}
-  MACHINE_ID: orion
+  MACHINE_ID: noaacloud
 
 jobs:
-  checkout-build-link:
-    runs-on: [self-hosted, orion-ready]
+  checkout:
+    runs-on: [self-hosted, aws, parallelworks, centos]
     timeout-minutes: 600
 
     steps:
+
     - name: Checkout global-workflow
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
       with:
-        path: ${{ github.run_id }}/HOMEgfs  # This path needs to be relative
+        path: ${{ github.run_id }}/HOMEgfs
+        submodules: 'recursive'
 
-    - name: Checkout components
-      run: |
-        cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
-        ./checkout.sh -c -g  # Options e.g. -u can be added late
+  build-link:
+    runs-on: [self-hosted, aws, parallelworks, centos]
+    needs: checkout
+
+    steps:
 
     - name: Build components
       run: |
         cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
-        ./build_all.sh
+        ./build_all.sh -j 8
 
     - name: Link artifacts
       run: |
@@ -44,43 +47,42 @@ jobs:
         ./link_workflow.sh
 
   create-experiments:
-    needs: checkout-build-link
-    runs-on: [self-hosted, orion-ready]
+    needs: build-link  # wait for the build and link steps, not just the checkout
+    runs-on: [self-hosted, aws, parallelworks, centos]
     strategy:
       matrix:
-        case: ["C48_S2S", "C96_atm3DVar"]
+        case: ["C48_ATM"]
 
     steps:
       - name: Create Experiments ${{ matrix.case }}
         env:
-          HOMEgfs_PR: ${{ env.TEST_DIR }}/HOMEgfs
           RUNTESTS: ${{ env.TEST_DIR }}/RUNTESTS
           pslot: ${{ matrix.case }}.${{ github.run_id }}
         run: |
+          mkdir -p ${{ env.RUNTESTS }}
           cd ${{ env.TEST_DIR }}/HOMEgfs
           source workflow/gw_setup.sh
-          source ci/platforms/orion.sh
-          ./ci/scripts/create_experiment.py --yaml ci/cases/${{ matrix.case }}.yaml --dir ${{ env.HOMEgfs_PR }}
+          source ci/platforms/config.noaacloud
+          ./workflow/create_experiment.py --yaml ci/cases/pr/${{ matrix.case }}.yaml --overwrite
 
   run-experiments:
     needs: create-experiments
-    runs-on: [self-hosted, orion-ready]
+    runs-on: [self-hosted, aws, parallelworks, centos]
     strategy:
       max-parallel: 2
       matrix:
-        case: ["C48_S2S", "C96_atm3DVar"]
+        case: ["C48_ATM"]
     steps:
       - name: Run Experiment ${{ matrix.case }}
         run: |
           cd ${{ env.TEST_DIR }}/HOMEgfs
-          ./ci/scripts/run-check_ci.sh ${{ env.TEST_DIR }} ${{ matrix.case }}.${{ github.run_id }}
+          ./ci/scripts/run-check_ci.sh ${{ env.TEST_DIR }} ${{ matrix.case }}.${{ github.run_id }} HOMEgfs
 
   clean-up:
     needs: run-experiments
-    runs-on: [self-hosted, orion-ready]
+    runs-on: [self-hosted, aws, parallelworks, centos]
     steps:
       - name: Clean-up
         run: |
           cd ${{ github.workspace }}
           rm -rf ${{ github.run_id }}
-
diff --git a/ci/scripts/utils/parallel_works/UserBootstrap_centos7.txt b/ci/scripts/utils/parallel_works/UserBootstrap_centos7.txt
new file mode 100644
index 0000000000..ddc6b05706
--- /dev/null
+++ b/ci/scripts/utils/parallel_works/UserBootstrap_centos7.txt
@@ -0,0 +1,5 @@
+sudo yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo.x86_64.rpm
+sudo yum -y install git
+/contrib/Terry.McGuinness/SETUP/provision_runner.sh
+ALLNODES
+/contrib/Terry.McGuinness/SETUP/mount-epic-contrib.sh
\ No newline at end of file
diff --git a/ci/scripts/utils/parallel_works/provision_runner.sh b/ci/scripts/utils/parallel_works/provision_runner.sh
new file mode 100755
index 0000000000..cac18c9315
--- /dev/null
+++ b/ci/scripts/utils/parallel_works/provision_runner.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# This script provisions a GitHub Actions runner on a Rocky or CentOS system.
+# It performs the following steps:
+# 1. Checks the operating system from /etc/os-release.
+# 2. Verifies if the operating system is either Rocky or CentOS.
+# 3. Checks if an actions-runner process is already running for the current user.
+# 4. Copies the actions-runner tar file from a specified directory to the home directory.
+# 5. Extracts the tar file and starts the actions-runner in the background.
+#
+# The actions-runner tar file contains the necessary binaries and scripts to run
+# a GitHub Actions runner. It is specific to the operating system and is expected
+# to be located in the /contrib/${CI_USER}/SETUP/ directory.
+
+CI_USER="Terry.McGuinness"
+
+# Get the Operating System name from /etc/os-release
+OS_NAME=$(grep -E '^ID=' /etc/os-release | sed -E 's/ID="?([^"]*)"?/\1/') || true
+
+# Check if the OS is Rocky or CentOS
+if [[ "${OS_NAME}" == "rocky" || "${OS_NAME}" == "centos" ]]; then
+  echo "Operating System is ${OS_NAME}"
+else
+  echo "Unsupported Operating System: ${OS_NAME}"
+  exit 1
+fi
+
+running=$(pgrep -u "${USER:-$(id -un)}" run-helper -c) || true  # use id -un when USER is unset or empty
+if [[ "${running}" -gt 0 ]]; then
+   echo "actions-runner is already running"
+   exit
+fi
+
+cp "/contrib/${CI_USER}/SETUP/actions-runner_${OS_NAME}.tar.gz" "${HOME}" || exit  # stop if the tarball is missing
+cd "${HOME}" || exit
+tar -xf "actions-runner_${OS_NAME}.tar.gz" || exit  # do not start a stale runner after a failed extract
+cd actions-runner || exit
+d=$(date +%Y-%m-%d-%H:%M)
+nohup ./run.sh >& "run_nohup${d}.log" &