Run SC Crawler for spot prices #29057
name: Run SC Crawler for spot prices
on:
  schedule:
    - cron: "0 * * * *"
    - cron: "5-55/5 * * * *"
  workflow_dispatch:
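# The first cron above triggers the hourly full crawl; the second (every 5
# minutes at 5-55) triggers the spot-price-only runs, as gated by the per-step
# conditions below.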
jobs:
  crawl:
    permissions:
      # for the stefanzweifel/git-auto-commit-action step
      contents: write
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}
    env:
      BUCKET: sc-data-public-40e9d310
      FILE: sc-data-all.db
      DB_HOST: ${{ secrets.DB_HOST }}
      DB_PORT: ${{ secrets.DB_PORT }}
      DB_USER: ${{ secrets.DB_USER }}
      DB_NAME: ${{ secrets.DB_NAME }}
      DB_REGION: ${{ secrets.DB_REGION }}
      HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
      AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
      AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
      AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
      COLUMNS: 160
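      # The DB_* secrets above identify the remote Postgres database (reached via
      # IAM authentication below), while HCLOUD_TOKEN and the AZURE_* credentials
      # are picked up by sc-crawler when pulling vendor data. COLUMNS sets a wide
      # assumed terminal width for the CLI output.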
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Cache uv
        uses: actions/cache@v4
        with:
          key: uv
          path: ~/.cache/uv
      - name: Install uv
        uses: yezz123/setup-uv@v4
        with:
          uv-venv: "sc_data"
      - name: Set up Python dependencies
        run: |
          uv pip install psycopg2
          uv pip install "sparecores-crawler[all] @ git+https://github.com/SpareCores/sc-crawler"
          uv pip install -e .
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Configure GCP credentials
        uses: google-github-actions/auth@v2
        with:
          credentials_json: '${{ secrets.GCP_SERVICE_ACCOUNT_JSON_KEY }}'
      - name: Load cached SQLite file
        id: cache-sqlite-restore
        uses: actions/cache/restore@v4
        with:
          key: sc-data-current-sqlite
          path: sc-data-all.db
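      # The cached SQLite snapshot lets the frequent spot-price runs reuse the last
      # pulled database instead of copying everything from the remote database again
      # (see the condition on the "Get current database" step below).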
      - name: Get connection string for remote database
        run: |
          PGPASSWORD="$( aws --region ${DB_REGION} rds generate-db-auth-token --hostname ${DB_HOST} --port ${DB_PORT} --username ${DB_USER} )"
          # URL-encode the IAM RDS token
          DB_PASS_ENCODED="$( jq -rn --arg x "${PGPASSWORD}" '$x|@uri' )"
          REMOTE_DB="postgresql://${DB_USER}:${DB_PASS_ENCODED}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
          echo "::add-mask::$PGPASSWORD"
          echo "PGPASSWORD=${PGPASSWORD}" >> "$GITHUB_ENV"
          echo "REMOTE_DB=${REMOTE_DB}" >> "$GITHUB_ENV"
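      # Note: RDS IAM auth tokens are only valid for 15 minutes, which is why the
      # connection string is regenerated again after the (potentially long) crawl.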
      - name: Get current database hourly or if not cached
        if: github.event.schedule == '0 * * * *' || steps.cache-sqlite-restore.outputs.cache-hit != 'true'
        run: |
          rm -f ${FILE}
          sc-crawler copy --source ${REMOTE_DB} --target sqlite:///${FILE}
      - name: Run SC Crawler on all resources once an hour or on manual request
        if: github.event.schedule == '0 * * * *' || github.event_name == 'workflow_dispatch'
        run: sc-crawler pull --connection-string sqlite:///${FILE}
      - name: Run SC Crawler only for spot prices
        if: github.event.schedule == '5-55/5 * * * *'
        # The Azure Retail Pricing API is very slow, so Azure spot prices are only refreshed in the hourly run
        run: sc-crawler pull --connection-string sqlite:///${FILE} --include-records server_prices_spot --exclude-vendor azure
      - name: Get connection string for remote database again (crawling might take 15+ mins)
        run: |
          PGPASSWORD="$( aws --region ${DB_REGION} rds generate-db-auth-token --hostname ${DB_HOST} --port ${DB_PORT} --username ${DB_USER} )"
          # URL-encode the IAM RDS token
          DB_PASS_ENCODED="$( jq -rn --arg x "${PGPASSWORD}" '$x|@uri' )"
          REMOTE_DB="postgresql://${DB_USER}:${DB_PASS_ENCODED}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
          echo "::add-mask::$PGPASSWORD"
          echo "PGPASSWORD=${PGPASSWORD}" >> "$GITHUB_ENV"
          echo "REMOTE_DB=${REMOTE_DB}" >> "$GITHUB_ENV"
      - name: Sync back (full)
        if: github.event.schedule == '0 * * * *' || github.event_name == 'workflow_dispatch'
        run: |
          sc-crawler sync --source sqlite:///${FILE} --target ${REMOTE_DB} --scd \
            --log-changes-path changes.log \
            --log-changes-tables country --log-changes-tables compliance_framework \
            --log-changes-tables vendor --log-changes-tables vendor_compliance_link \
            --log-changes-tables region --log-changes-tables zone \
            --log-changes-tables storage --log-changes-tables server \
            --log-changes-tables benchmark
          psql "host=${DB_HOST} port=${DB_PORT} dbname=${DB_NAME} user=${DB_USER} sslmode=require" -c "VACUUM ANALYZE"
      - name: Sync back (server prices)
        if: github.event.schedule == '5-55/5 * * * *'
        run: |
          sc-crawler sync --source sqlite:///${FILE} --target ${REMOTE_DB} --scd --sync-tables server_price
          touch changes.log
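      # touch ensures changes.log exists for the CHANGELOG and commit-message steps
      # below, as the spot-price sync does not write a change log.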
      - name: Upload if changed
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          DBHUB_TOKEN: ${{ secrets.DBHUB_TOKEN }}
        run: |
          echo "Updating $FILE..."
          RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
          echo "Last known hash in S3: $RHASH"
          LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
          echo "Currently pulled database's hash: $LHASH"
          if [ "$LHASH" != "$RHASH" ]; then
            bzip2 -k -9 "$FILE"
            aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
          fi
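          # Build a trimmed "priceless" copy without the price and benchmark-score
          # rows; this copy is committed to the repo and pushed to DBHub below.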
          FILE="sc-data-priceless.db"
          echo "Truncating $FILE..."
          cp sc-data-all.db $FILE
          sqlite3 $FILE "
            DELETE FROM server_price;
            DELETE FROM storage_price;
            DELETE FROM traffic_price;
            DELETE FROM ipv4_price;
            DELETE FROM benchmark_score;
            VACUUM;"
          echo "Updating $FILE..."
          RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
          echo "Last known hash in S3: $RHASH"
          LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
          echo "Currently pulled database's hash: $LHASH"
          if [ "$LHASH" != "$RHASH" ]; then
            bzip2 -k -9 "$FILE"
            # update S3
            aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
            # update DBHUB
            GHARUNURL=$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID
            COMMIT=$(curl -s --fail-with-body -F apikey=${DBHUB_TOKEN} \
              -F dbowner=sparecores -F dbname=sc-data-priceless.db \
              https://api.dbhub.io/v1/commits | jq -r "to_entries|sort_by(.value.timestamp)|last|.key")
            echo "Most recent commit: $COMMIT"
            curl --fail-with-body -F apikey=$DBHUB_TOKEN -F dbname=$FILE \
              -F commitmsg="SC Crawler run on $(date '+%Y-%m-%d %H:%M:%S')" \
              -F commit=$COMMIT \
              -F "sourceurl=$GHARUNURL" \
              -F file=@$FILE -F "branch=main" https://api.dbhub.io/v1/upload
            # note on Slack
            curl -X POST -H 'Content-type: application/json' \
              --data "{\"text\": \"$FILE was <$GHARUNURL|updated> on S3/DBHUB with the following hash: $LHASH\"}" \
              $SLACK_WEBHOOK
            # update CHANGELOG
            SCDATAV=$(python -c "from importlib.metadata import version; print(version('sparecores-data'))")
            DATE=$(date '+%Y%m%d%H%M%S')
            NEWVERSION="v$SCDATAV+$DATE"
            NEWVERSIONDATE="$NEWVERSION ($(date '+%b %d, %Y'))"
            echo "## ${NEWVERSIONDATE}" > CHANGELOG.new
            CHANGESFOUND="\nThis is an automated release with $(cat changes.log | grep '^-' | wc -l) new/updated/deleted resources.\nJob was run at ${GHARUNURL}"
            echo -e "${CHANGESFOUND}" >> CHANGELOG.new
            echo "" >> CHANGELOG.new
            cat changes.log >> CHANGELOG.new
            echo "" >> CHANGELOG.new
            cat CHANGELOG.md >> CHANGELOG.new
            mv CHANGELOG.new CHANGELOG.md
            # pass required env vars and files for later steps
            cp sc-data-priceless.db src/sc_data/data/sc-data-priceless.db
            echo $LHASH > src/sc_data/data/db_hash
            echo "CHANGESFOUND=${CHANGESFOUND}" >> "$GITHUB_ENV"
            echo "GHARUNURL=${GHARUNURL}" >> "$GITHUB_ENV"
            echo "NEWVERSION=${NEWVERSION}" >> "$GITHUB_ENV"
            echo "NEWVERSIONDATE=${NEWVERSIONDATE}" >> "$GITHUB_ENV"
          fi
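      # actions/cache entries are immutable, so the previous snapshot is deleted
      # first and the refreshed database is then saved under the same key.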
      - name: Clear SQLite file cache
        if: steps.cache-sqlite-restore.outputs.cache-hit == 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
        run: gh cache delete sc-data-current-sqlite
      - name: Save SQLite file
        id: cache-sqlite-save
        uses: actions/cache/save@v4
        with:
          key: sc-data-current-sqlite
          path: sc-data-all.db
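      # NEWVERSION, NEWVERSIONDATE and CHANGESFOUND are only set when the database
      # hash changed; on unchanged runs the commit, tag and release steps below are
      # effectively no-ops.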
      - name: Generate the commit message
        id: generate-commit-message
        run: |
          echo "tag=$NEWVERSION" >> $GITHUB_OUTPUT
          echo 'message<<EOF' >> $GITHUB_OUTPUT
          echo -e "$CHANGESFOUND\n\n" >> $GITHUB_OUTPUT
          cat changes.log >> $GITHUB_OUTPUT
          echo 'EOF' >> $GITHUB_OUTPUT
          git pull
      - name: Commit priceless SQLite changes to the repo
        uses: stefanzweifel/git-auto-commit-action@v5
        id: auto-commit-action
        with:
          commit_message: ${{ steps.generate-commit-message.outputs.message }}
          file_pattern: "CHANGELOG.md src/sc_data/data/*"
          tagging_message: ${{ steps.generate-commit-message.outputs.tag }}
          commit_author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
      - name: Make release after new changes
        uses: softprops/action-gh-release@v2
        if: steps.auto-commit-action.outputs.changes_detected == 'true'
        with:
          tag_name: ${{ steps.generate-commit-message.outputs.tag }}
          name: ${{ env.NEWVERSIONDATE }}
          body: ${{ steps.generate-commit-message.outputs.message }}
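      # The artifact is stored without extra compression (compression-level: 0),
      # presumably because the database dump is already bzip2-compressed.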
      - name: Archive priceless
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: priceless-data-and-changes
          compression-level: 0
          path: |
            changes.log
            sc-data-priceless.db.bz2
      - name: Notify monitoring that the job ran
        run: curl -s https://uptime.betterstack.com/api/v1/heartbeat/NTnwVC261MyrLnevn4WJc5Jh
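  # GitHub disables cron-triggered workflows in repositories without recent
  # activity, so this job keeps the schedule alive.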
  workflow-keepalive:
    if: github.event.schedule == '0 * * * *'
    runs-on: ubuntu-latest
    permissions:
      actions: write
    steps:
      - uses: liskin/gh-workflow-keepalive@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}