Run SC Crawler #298

Workflow file for this run

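# Scheduled crawler: pull AWS data into SQLite, then publish the full and
# price-stripped ("priceless") databases to S3 and DBHub, notifying Slack.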
name: Run SC Crawler
on:
  schedule:
    - cron: "0 8 * * *" # daily at 08:00 UTC
  workflow_dispatch:
jobs:
  crawl:
    if: ${{ false }} # disabled in favor of the SCD-based crawler
    runs-on: ubuntu-latest
    env:
      BUCKET: sc-data-public-40e9d310
      FILE: sc-data-all.db
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Set up Python dependencies
        run: pip install "sc_crawler[all] @ git+https://github.com/SpareCores/sc-crawler"
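      # S3 read/write on the data bucket; keys come from repository secrets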
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Run SC Crawler
        run: sc-crawler pull --connection-string sqlite:///$FILE --include-vendor aws
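      # compare the fresh database's content hash against the hash stored as
      # S3 object metadata, and upload only when they differ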
      - name: Upload if changed
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          DBHUB_TOKEN: ${{ secrets.DBHUB_TOKEN }}
        run: |
          echo "Updating $FILE..."
          RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
          echo "Last known hash in S3: $RHASH"
          LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
          echo "Currently pulled database's hash: $LHASH"
          if [ "$LHASH" != "$RHASH" ]; then
              bzip2 -k -9 "$FILE"
              aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
          fi
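          # build the smaller "priceless" variant by dropping all price tables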
          FILE="sc-data-priceless.db"
          echo "Truncating $FILE..."
          mv sc-data-all.db $FILE
          sqlite3 $FILE "
            DELETE FROM server_price;
            DELETE FROM storage_price;
            DELETE FROM traffic_price;
            DELETE FROM ipv4_price;
            VACUUM;"
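          # repeat the hash check and conditional upload for the truncated
          # database, also publishing to DBHub and notifying Slack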
echo "Updating $FILE..."
RHASH=$(aws s3api head-object --bucket $BUCKET --key $FILE.bz2 --query "Metadata.hash" --output=text)
echo "Last known hash in S3: $RHASH"
LHASH=$(sc-crawler hash --connection-string sqlite:///$FILE)
echo "Currently pulled database's hash: $LHASH"
if [ "$LHASH" != "$RHASH" ]; then
bzip2 -k -9 "$FILE"
# update S3
aws s3 cp "$FILE".bz2 "s3://$BUCKET/$FILE".bz2 --metadata="hash=$LHASH"
# update DBHUB
COMMIT=$(curl -s --fail-with-body -F apikey=${DBHUB_TOKEN} \
-F dbowner=sparecores -F dbname=sc-data-priceless.db \
https://api.dbhub.io/v1/commits | jq -r "to_entries|sort_by(.value.timestamp)|last|.key")
echo "Most recent commit: $COMMIT"
curl --fail-with-body -F apikey=$DBHUB_TOKEN -F dbname=$FILE \
-F commitmsg="SC Crawler run on $(date '+%Y-%m-%d %H:%M:%S')" \
-F commit=$COMMIT \
-F "sourceurl=$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" \
-F file=@$FILE -F "branch=main" https://api.dbhub.io/v1/upload
# note on Slack
curl -X POST -H 'Content-type: application/json' \
--data "{\"text\": \"$FILE was updated on S3 with the following hash: $LHASH\"}" \
$SLACK_WEBHOOK
fi
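
For reference, a minimal sketch of how the published artifact could be verified locally, assuming the bucket allows public (unsigned) reads and sc-crawler is installed; the download URL below follows the standard S3 virtual-hosted format, which the workflow itself does not confirm:

    # read the content hash stored as S3 object metadata (unsigned request)
    aws s3api head-object --no-sign-request --bucket sc-data-public-40e9d310 \
        --key sc-data-all.db.bz2 --query "Metadata.hash" --output=text
    # download, decompress, and recompute the hash to compare
    curl -sO "https://sc-data-public-40e9d310.s3.amazonaws.com/sc-data-all.db.bz2"
    bunzip2 sc-data-all.db.bz2
    sc-crawler hash --connection-string sqlite:///sc-data-all.db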