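# NOTE(review): the lines below look like web-page residue captured alongside
# this workflow rather than workflow content; they are kept only as comments
# so that the file parses as YAML.
# Skip to content
# 2024 modelling data update #3436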
name: build-and-test-dbt

# Runs for pull requests targeting master, for pushes to master, and on manual
# dispatch with an explicit, space-separated list of dbt models.
on:
  pull_request:
    branches: [master]
  push:
    branches: [master]
  workflow_dispatch:
    inputs:
      models:
        required: true
        description: A space-separated list of dbt models
        default: 'default.vw_pin_sale default.vw_pin_universe'
        type: string

jobs:
  build-and-test-dbt:
    runs-on: ubuntu-latest
    # These permissions are needed to interact with GitHub's OIDC Token endpoint
    # so that we can authenticate with AWS
    permissions:
      id-token: write
      contents: read
    # NOTE(review): PROJECT_DIR, STATE_DIR, TARGET_DIR, TARGET and CACHE_KEY are
    # read below but never set in this file; presumably the local setup_dbt
    # action exports them -- confirm.
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup dbt
        uses: ./.github/actions/setup_dbt
        with:
          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
          role-duration-seconds: 14400  # Worst-case time for full build

      - name: Restore dbt state cache
        id: cache
        uses: ./.github/actions/restore_dbt_cache
        with:
          path: ${{ env.PROJECT_DIR }}/${{ env.STATE_DIR }}
          key: ${{ env.CACHE_KEY }}

      # Translate the cache result and the triggering event into flags that
      # the build and test steps below branch on.
      - name: Set command args for building/testing resources
        run: |
          if [[ $CACHE_HIT == 'true' ]]; then
            echo "MODIFIED_RESOURCES_ONLY=true" >> "$GITHUB_ENV"
          fi
          if [[ $EVENT_NAME == 'workflow_dispatch' ]]; then
            echo "MANUALLY_DISPATCHED=true" >> "$GITHUB_ENV"
          fi
        shell: bash
        env:
          CACHE_HIT: ${{ steps.cache.outputs.cache-hit }}
          EVENT_NAME: ${{ github.event_name }}

      - name: Test dbt macros
        run: dbt run-operation test_all
        working-directory: ${{ env.PROJECT_DIR }}
        shell: bash

      - name: Deploy model dependencies
        run: ../.github/scripts/deploy_dbt_model_dependencies.sh
        working-directory: ${{ env.PROJECT_DIR }}
        shell: bash

      # Note that the manually selected models are only honored when the state
      # cache was restored; on a cache miss every resource is built.
      - name: Build models
        run: |
          if [[ $MODIFIED_RESOURCES_ONLY == 'true' ]]; then
            if [[ $MANUALLY_DISPATCHED == 'true' ]]; then
              echo "Running build on manually selected resources"
              # Split the space-separated input into one argument per model
              # without letting the shell evaluate its contents.
              read -ra selected_models <<< "$MODELS"
              dbt build -t "$TARGET" -s "${selected_models[@]}" --defer --state "$STATE_DIR" --resource-types model seed
            elif [[ $TARGET == 'prod' ]]; then
              # Rebuild children of modified resources in prod, since schema
              # changes might cause those children to become stale.
              # Note that we only rebuild children if we're in prod, since
              # otherwise we might end up running unnecessary builds of
              # big downstream tables on every commit. We also only do this for
              # modified resources and not new resources, since new resources
              # can't accidentally modify downstream model schemas without
              # those child models _also_ being modified
              echo "Running build on modified/new resources and their children"
              dbt build -t "$TARGET" -s state:modified+ state:new --state "$STATE_DIR" --resource-types model seed
            else
              echo "Running build on modified/new resources only"
              dbt build -t "$TARGET" -s state:modified state:new --defer --state "$STATE_DIR" --resource-types model seed
            fi
          else
            echo "Running build on all resources"
            dbt build -t "$TARGET" --resource-types model seed
          fi
        working-directory: ${{ env.PROJECT_DIR }}
        shell: bash
        env:
          # Pass the user-supplied input through the environment rather than
          # interpolating it into the script text, to prevent script injection.
          MODELS: ${{ inputs.models }}

      # Tests tagged data_test_iasworld are excluded in every branch.
      - name: Test models
        run: |
          if [[ $MODIFIED_RESOURCES_ONLY == 'true' ]]; then
            if [[ $MANUALLY_DISPATCHED == 'true' ]]; then
              echo "Running tests on manually selected resources"
              # Split the space-separated input into one argument per model
              # without letting the shell evaluate its contents.
              read -ra selected_models <<< "$MODELS"
              dbt test -t "$TARGET" -s "${selected_models[@]}" --exclude "tag:data_test_iasworld" --defer --state "$STATE_DIR"
            else
              echo "Running tests on modified/new resources only"
              dbt test -t "$TARGET" -s state:modified state:new --exclude "tag:data_test_iasworld" --defer --state "$STATE_DIR"
            fi
          else
            echo "Running tests on all resources"
            dbt test -t "$TARGET" --exclude "tag:data_test_iasworld"
          fi
        working-directory: ${{ env.PROJECT_DIR }}
        shell: bash
        env:
          # Pass the user-supplied input through the environment rather than
          # interpolating it into the script text, to prevent script injection.
          MODELS: ${{ inputs.models }}

      # Only refresh the cached state from master, and only when every
      # preceding step succeeded.
      - name: Update dbt state cache
        if: github.ref == 'refs/heads/master' && success()
        uses: ./.github/actions/save_dbt_cache
        with:
          path: ${{ env.PROJECT_DIR }}/${{ env.TARGET_DIR }}
          key: ${{ env.CACHE_KEY }}