Skip to content

Commit

Permalink
Migrate update-s3-html job to test-infra (#4879)
Browse files Browse the repository at this point in the history
  • Loading branch information
atalman committed Jan 15, 2024
1 parent 8acbaa9 commit ca6be2a
Show file tree
Hide file tree
Showing 5 changed files with 631 additions and 0 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/update-s3-html.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Update S3 HTML indices for download.pytorch.org

on:
  schedule:
    # Update the indices every 30 minutes
    - cron: "*/30 * * * *"
  workflow_dispatch:

# id-token: write is required for aws-actions/configure-aws-credentials
# OIDC role assumption below.
permissions:
  id-token: write
  contents: read

jobs:
  update:
    runs-on: ubuntu-22.04
    environment: pytorchbot-env
    strategy:
      # One job per top-level S3 prefix whose HTML index pages are regenerated.
      matrix:
        prefix: ["whl", "whl/test", "whl/nightly", "whl/lts/1.8"]
      fail-fast: false
    container:
      image: continuumio/miniconda3:4.12.0
    steps:
      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_update
          aws-region: us-east-1
      - name: Checkout repository test-infra
        uses: actions/checkout@v3
        with:
          repository: pytorch/test-infra
          # NOTE(review): this checks out test-infra at the *calling* repo's ref;
          # only valid if the ref exists in pytorch/test-infra — confirm.
          ref: ${{ github.ref }}
      - name: Update s3 html index
        run: |
          set -ex
          # `conda activate` is not defined in a fresh non-interactive shell;
          # load the shell integration first so the activate below succeeds.
          eval "$(conda shell.bash hook)"
          # Create Conda Environment
          conda create --quiet -y --prefix run_env python="3.8"
          conda activate ./run_env
          # Install requirements
          pip install -r s3_management/requirements.txt
          python s3_management/manage.py --generate-pep503 ${{ matrix.prefix }}
3 changes: 3 additions & 0 deletions s3_management/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# s3_management

This directory houses the scripts that maintain the S3 HTML indices for https://download.pytorch.org/whl.
73 changes: 73 additions & 0 deletions s3_management/backup_conda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# Downloads domain pytorch and library packages from channel
# And backs them up to S3
# Do not use unless you know what you are doing
# Usage: python backup_conda.py --version 1.6.0

import argparse
import hashlib
import os
import urllib
# `import urllib` alone does not guarantee the `request` submodule is loaded;
# import it explicitly so urllib.request.urlopen cannot raise AttributeError.
import urllib.request
from typing import List, Optional

import boto3
import conda.api

# Shared S3 handles: every upload in this script targets the same backup bucket.
S3 = boto3.resource('s3')
BUCKET = S3.Bucket('pytorch-backup')
# Conda platform subdirs queried for packages; noarch is absent — presumably
# intentional since pytorch builds are platform-specific, TODO confirm.
_known_subdirs = ["linux-64", "osx-64", "osx-arm64", "win-64"]


def compute_md5(path: str) -> str:
    """Return the hex md5 digest of the file at *path*.

    Hashes the file in fixed-size chunks so arbitrarily large package
    archives never have to fit into memory at once (the original read the
    whole file with a single ``f.read()``).
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter(..., b"") yields 1 MiB chunks until EOF.
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download_conda_package(package: str, version: Optional[str] = None,
                           depends: Optional[str] = None,
                           channel: Optional[str] = None) -> List[str]:
    """Download all matching conda packages across the known platform subdirs.

    Args:
        package: conda package name to query (e.g. ``"pytorch"``).
        version: if given, keep only packages with this exact version string.
        depends: if given, keep only packages whose dependency list contains
            this exact spec string (e.g. ``"pytorch 1.6.0"``).
        channel: conda channel to search; ``None`` searches the defaults.

    Returns:
        Local relative paths (``<subdir>/<filename>``) of every package that
        was downloaded (or already cached) and passed its md5 check.
    """
    packages = conda.api.SubdirData.query_all(
        package,
        channels=[channel] if channel is not None else None,
        subdirs=_known_subdirs,
    )
    rc = []

    for pkg in packages:
        # Apply the optional exact-version / dependency-spec filters.
        if version is not None and pkg.version != version:
            continue
        if depends is not None and depends not in pkg.depends:
            continue

        print(f"Downloading {pkg.url}...")
        os.makedirs(pkg.subdir, exist_ok=True)
        fname = f"{pkg.subdir}/{pkg.fn}"
        if not os.path.exists(fname):
            # Skip the network fetch when the file is already cached locally.
            with open(fname, "wb") as f, urllib.request.urlopen(pkg.url) as url:
                f.write(url.read())
        # Hash once instead of twice; a mismatch means a corrupt or partial
        # download — report it and exclude the file from the upload list.
        actual_md5 = compute_md5(fname)
        if actual_md5 != pkg.md5:
            print(f"md5 of {fname} is {actual_md5} does not match {pkg.md5}")
            continue
        rc.append(fname)

    return rc

def upload_to_s3(prefix: str, fnames: List[str]) -> None:
    """Upload each local file to the backup bucket under ``<prefix>/<fname>``."""
    for local_path in fnames:
        key = f"{prefix}/{local_path}"
        BUCKET.upload_file(local_path, key)
        print(local_path)



if __name__ == "__main__":
    # Back up one pytorch release plus the domain libraries built against it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--version",
        help="PyTorch Version to backup",
        type=str,
        required=True,
    )
    options = parser.parse_args()
    dest_prefix = f"v{options.version}/conda"

    # pytorch itself is selected by exact version...
    pytorch_files = download_conda_package(
        "pytorch", channel="pytorch", version=options.version
    )
    upload_to_s3(dest_prefix, pytorch_files)

    # ...while the domain libraries are selected by their pytorch dependency.
    for libname in ["torchvision", "torchaudio", "torchtext"]:
        print(f"processing {libname}")
        lib_files = download_conda_package(
            libname, channel="pytorch", depends=f"pytorch {options.version}"
        )
        upload_to_s3(dest_prefix, lib_files)
Loading

0 comments on commit ca6be2a

Please sign in to comment.