Run SC Crawler for spot prices #29044

Workflow file for this run

name: Run SC Crawler for spot prices
on:
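# Two schedules: a full crawl at the top of every hour, and spot-price-only runs every 5 minutes in between (minutes 5-55).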
schedule:
- cron: "0 * * * *"
- cron: "5-55/5 * * * *"
workflow_dispatch:
jobs:
crawl:
permissions:
# for the stefanzweifel/git-auto-commit-action step
contents: write
runs-on: ubuntu-latest
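# A single concurrency group per workflow, so overlapping scheduled runs do not crawl or sync at the same time.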
concurrency:
group: ${{ github.workflow }}
env:
BUCKET: sc-data-public-40e9d310
FILE: sc-data-all.db
DB_HOST: ${{ secrets.DB_HOST }}
DB_PORT: ${{ secrets.DB_PORT }}
DB_USER: ${{ secrets.DB_USER }}
DB_NAME: ${{ secrets.DB_NAME }}
DB_REGION: ${{ secrets.DB_REGION }}
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
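# Wider terminal width so the crawler's CLI output (tables, progress bars) is presumably not wrapped in the job logs.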
COLUMNS: 160
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Cache uv
uses: actions/cache@v4
with:
key: uv
path: ~/.cache/uv
- name: Install uv
uses: yezz123/setup-uv@v4
with:
uv-venv: "sc_data"
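# Install the Postgres driver, the crawler straight from GitHub, and this repository in editable mode into the uv venv.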
- name: Set up Python dependencies
run: |
uv pip install psycopg2
uv pip install "sparecores-crawler[all] @ git+https://github.com/SpareCores/sc-crawler"
uv pip install -e .
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Configure GCP credentials
uses: google-github-actions/auth@v2
with:
credentials_json: '${{ secrets.GCP_SERVICE_ACCOUNT_JSON_KEY }}'
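# Restore the SQLite snapshot saved by the previous run, so the frequent spot-price runs can skip the full copy from Postgres.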
- name: Load cached SQLite file
id: cache-sqlite-restore
uses: actions/cache/restore@v4
with:
key: sc-data-current-sqlite
path: sc-data-all.db
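# Build the Postgres connection string with a short-lived RDS IAM auth token; the token is masked in the logs and URL-encoded,
# as it contains URL-unsafe characters such as '=' and '&'.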
- name: Get connection string for remote database
run: |
PGPASSWORD="$( aws --region ${DB_REGION} rds generate-db-auth-token --hostname ${DB_HOST} --port ${DB_PORT} --username ${DB_USER} )"
# URL-encode the IAM RDS token
DB_PASS_ENCODED="$( jq -rn --arg x "${PGPASSWORD}" '$x|@uri' )"
REMOTE_DB="postgresql://${DB_USER}:${DB_PASS_ENCODED}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
echo "::add-mask::$PGPASSWORD"
echo "PGPASSWORD=${PGPASSWORD}" >> "$GITHUB_ENV"
echo "REMOTE_DB=${REMOTE_DB}" >> "$GITHUB_ENV"
- name: Get current database hourly or if not cached
if: github.event.schedule == '0 * * * *' || steps.cache-sqlite-restore.outputs.cache-hit != 'true'
run: |
rm -f ${FILE}
sc-crawler copy --source ${REMOTE_DB} --target sqlite:///${FILE}
- name: Run SC Crawler on all resources once an hour or on manual request
if: github.event.schedule == '0 * * * *' || github.event_name == 'workflow_dispatch'
run: sc-crawler pull --connection-string sqlite:///${FILE}
- name: Run SC Crawler only for spot prices
if: github.event.schedule == '5-55/5 * * * *'
# The Azure Retail Pricing API is very slow, so Azure spot prices are refreshed only in the hourly full run
run: sc-crawler pull --connection-string sqlite:///${FILE} --include-records server_prices_spot --exclude-vendor azure
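# RDS IAM auth tokens are only valid for 15 minutes, so regenerate the connection string after the potentially long crawl.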
- name: Get connection string for remote database again (crawling might take 15+ mins)
run: |
PGPASSWORD="$( aws --region ${DB_REGION} rds generate-db-auth-token --hostname ${DB_HOST} --port ${DB_PORT} --username ${DB_USER} )"
# URL-encode the IAM RDS token
DB_PASS_ENCODED="$( jq -rn --arg x "${PGPASSWORD}" '$x|@uri' )"
REMOTE_DB="postgresql://${DB_USER}:${DB_PASS_ENCODED}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
echo "::add-mask::$PGPASSWORD"
echo "PGPASSWORD=${PGPASSWORD}" >> "$GITHUB_ENV"
echo "REMOTE_DB=${REMOTE_DB}" >> "$GITHUB_ENV"
- name: Sync back (full)
if: github.event.schedule == '0 * * * *' || github.event_name == 'workflow_dispatch'
run: |
sc-crawler sync --source sqlite:///${FILE} --target ${REMOTE_DB} --scd \
  --log-changes-path changes.log \
  --log-changes-tables country --log-changes-tables compliance_framework \
  --log-changes-tables vendor --log-changes-tables vendor_compliance_link \
  --log-changes-tables region --log-changes-tables zone \
  --log-changes-tables storage --log-changes-tables server --log-changes-tables benchmark
psql "host=${DB_HOST} port=${DB_PORT} dbname=${DB_NAME} user=${DB_USER} sslmode=require" -c "VACUUM ANALYZE"
- name: Sync back (server prices)
if: github.event.schedule == '5-55/5 * * * *'
run: |
sc-crawler sync --source sqlite:///${FILE} --target ${REMOTE_DB} --scd --sync-tables server_price
touch changes.log
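# Compare the local database hash against the copy in S3 and, only when it changed, upload the full and the price-stripped
# ("priceless") databases to S3 and DBHub, notify Slack, and extend the changelog. The empty changes.log touched above
# keeps the changelog handling working on spot-price runs as well.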
- name: Upload if changed
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
DBHUB_TOKEN: ${{ secrets.DBHUB_TOKEN }}
run: |
echo "Updating $FILE..."
RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
echo "Last known hash in S3: $RHASH"
LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
echo "Currently pulled database's hash: $LHASH"
if [ "$LHASH" != "$RHASH" ]; then
bzip2 -k -9 "$FILE"
aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
fi
FILE="sc-data-priceless.db"
echo "Truncating $FILE..."
cp sc-data-all.db $FILE
sqlite3 $FILE "
DELETE FROM server_price;
DELETE FROM storage_price;
DELETE FROM traffic_price;
DELETE FROM ipv4_price;
DELETE FROM benchmark_score;
VACUUM;"
echo "Updating $FILE..."
RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
echo "Last known hash in S3: $RHASH"
LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
echo "Currently pulled database's hash: $LHASH"
if [ "$LHASH" != "$RHASH" ]; then
bzip2 -k -9 "$FILE"
# update S3
aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
# update DBHUB
GHARUNURL=$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID
COMMIT=$(curl -s --fail-with-body -F apikey=${DBHUB_TOKEN} \
-F dbowner=sparecores -F dbname=sc-data-priceless.db \
https://api.dbhub.io/v1/commits | jq -r "to_entries|sort_by(.value.timestamp)|last|.key")
echo "Most recent commit: $COMMIT"
curl --fail-with-body -F apikey=$DBHUB_TOKEN -F dbname=$FILE \
-F commitmsg="SC Crawler run on $(date '+%Y-%m-%d %H:%M:%S')" \
-F commit=$COMMIT \
-F "sourceurl=$GHARUNURL" \
-F file=@$FILE -F "branch=main" https://api.dbhub.io/v1/upload
# note on Slack
curl -X POST -H 'Content-type: application/json' \
--data "{\"text\": \"$FILE was <$GHARUNURL|updated> on S3/DBHUB with the following hash: $LHASH\"}" \
$SLACK_WEBHOOK
# update CHANGELOG
SCDATAV=$(python -c "from importlib.metadata import version; print(version('sparecores-data'))")
DATE=$(date '+%Y%m%d%H%M%S')
NEWVERSION="v$SCDATAV+$DATE"
NEWVERSIONDATE="$NEWVERSION ($(date '+%b %d, %Y'))"
echo "## ${NEWVERSIONDATE}" > CHANGELOG.new
CHANGESFOUND="\nThis is an automated release with $(cat changes.log | grep '^-' | wc -l) new/updated/deleted resources.\nJob was run at ${GHARUNURL}"
echo -e "${CHANGESFOUND}" >> CHANGELOG.new
echo "" >> CHANGELOG.new
cat changes.log >> CHANGELOG.new
echo "" >> CHANGELOG.new
cat CHANGELOG.md >> CHANGELOG.new
mv CHANGELOG.new CHANGELOG.md
# pass required env vars and files for later steps
cp sc-data-priceless.db src/sc_data/data/sc-data-priceless.db
echo $LHASH > src/sc_data/data/db_hash
echo "CHANGESFOUND=${CHANGESFOUND}" >> "$GITHUB_ENV"
echo "GHARUNURL=${GHARUNURL}" >> "$GITHUB_ENV"
echo "NEWVERSION=${NEWVERSION}" >> "$GITHUB_ENV"
echo "NEWVERSIONDATE=${NEWVERSIONDATE}" >> "$GITHUB_ENV"
fi
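# actions/cache entries are immutable, so delete the previous entry before saving the updated SQLite file under the same key.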
- name: Clear SQLite file cache
if: steps.cache-sqlite-restore.outputs.cache-hit == 'true'
env:
GH_TOKEN: ${{ secrets.GH_TOKEN }}
run: gh cache delete sc-data-current-sqlite
- name: Save SQLite file
id: cache-sqlite-save
uses: actions/cache/save@v4
with:
key: sc-data-current-sqlite
path: sc-data-all.db
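# Expose the new tag and a multi-line commit message (built from changes.log via the <<EOF output syntax) for the commit and
# release steps below, and pull any commits pushed since checkout so the auto-commit does not conflict.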
- name: Generate the commit message
id: generate-commit-message
run: |
echo "tag=$NEWVERSION" >> $GITHUB_OUTPUT
echo 'message<<EOF' >> $GITHUB_OUTPUT
echo -e "$CHANGESFOUND\n\n" >> $GITHUB_OUTPUT
cat changes.log >> $GITHUB_OUTPUT
echo 'EOF' >> $GITHUB_OUTPUT
git pull
- name: Commit priceless SQLite changes to the repo
uses: stefanzweifel/git-auto-commit-action@v5
id: auto-commit-action
with:
commit_message: ${{ steps.generate-commit-message.outputs.message }}
file_pattern: "CHANGELOG.md src/sc_data/data/*"
tagging_message: ${{ steps.generate-commit-message.outputs.tag }}
commit_author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
- name: Make release after new changes
uses: softprops/action-gh-release@v2
if: steps.auto-commit-action.outputs.changes_detected == 'true'
with:
tag_name: ${{ steps.generate-commit-message.outputs.tag }}
name: ${{ env.NEWVERSIONDATE }}
body: ${{ steps.generate-commit-message.outputs.message }}
- name: Archive priceless
if: always()
uses: actions/upload-artifact@v4
with:
name: priceless-data-and-changes
compression-level: 0
path: |
changes.log
sc-data-priceless.db.bz2
- name: Notify monitoring that the job ran
run: curl -s https://uptime.betterstack.com/api/v1/heartbeat/NTnwVC261MyrLnevn4WJc5Jh
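# GitHub disables cron-triggered workflows after 60 days without repository activity; this job keeps the schedule alive.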
workflow-keepalive:
if: github.event.schedule == '0 * * * *'
runs-on: ubuntu-latest
permissions:
actions: write
steps:
- uses: liskin/gh-workflow-keepalive@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}