From c239b79354bf750d6523d61ac8910d04a09c3990 Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Wed, 24 Jul 2024 11:31:19 -0600
Subject: [PATCH 1/5] apple: update to official pytorch 2.4 release

---
 install/apple/poetry.lock    | 166 +++++++++++++++++------------------
 install/apple/pyproject.toml |   6 +-
 2 files changed, 83 insertions(+), 89 deletions(-)

diff --git a/install/apple/poetry.lock b/install/apple/poetry.lock
index 90170559..79bb0260 100644
--- a/install/apple/poetry.lock
+++ b/install/apple/poetry.lock
@@ -495,9 +495,9 @@ training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.6.0)", "pr
 
 [package.source]
 type = "git"
-url = "https://github.com/huggingface/diffusers"
-reference = "HEAD"
-resolved_reference = "a9c403c00197d8d6eb854128dda2f7849cedd100"
+url = "https://github.com/bghira/diffusers"
+reference = "feature/lavender-flow-complete"
+resolved_reference = "0d3fcdbca358c3b93c917f7fd7eac575fd95d898"
 
 [[package]]
 name = "dill"
@@ -836,20 +836,6 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link
 perf = ["ipython"]
 testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"]
 
-[[package]]
-name = "intel-openmp"
-version = "2021.4.0"
-description = "Intel OpenMP* Runtime Library"
-optional = false
-python-versions = "*"
-files = [
-    {file = "intel_openmp-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:41c01e266a7fdb631a7609191709322da2bbf24b252ba763f125dd651bcc7675"},
-    {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:3b921236a38384e2016f0f3d65af6732cf2c12918087128a9163225451e776f2"},
-    {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:e2240ab8d01472fed04f3544a878cda5da16c26232b7ea1b59132dbfb48b186e"},
-    {file = "intel_openmp-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:6e863d8fd3d7e8ef389d52cf97a50fe2afe1a19247e8c0d168ce021546f96fc9"},
-    {file = "intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:eef4c8bcc8acefd7f5cd3b9384dbf73d59e2c99fc56545712ded913f43c4a94f"},
-]
-
 [[package]]
 name = "iterutils"
 version = "0.1.6"
@@ -996,24 +982,6 @@ files = [
     {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"},
 ]
 
-[[package]]
-name = "mkl"
-version = "2021.4.0"
-description = "Intel® oneAPI Math Kernel Library"
-optional = false
-python-versions = "*"
-files = [
-    {file = "mkl-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:67460f5cd7e30e405b54d70d1ed3ca78118370b65f7327d495e9c8847705e2fb"},
-    {file = "mkl-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:636d07d90e68ccc9630c654d47ce9fdeb036bb46e2b193b3a9ac8cfea683cce5"},
-    {file = "mkl-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:398dbf2b0d12acaf54117a5210e8f191827f373d362d796091d161f610c1ebfb"},
-    {file = "mkl-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:439c640b269a5668134e3dcbcea4350459c4a8bc46469669b2d67e07e3d330e8"},
-    {file = "mkl-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:ceef3cafce4c009dd25f65d7ad0d833a0fbadc3d8903991ec92351fe5de1e718"},
-]
-
-[package.dependencies]
-intel-openmp = "==2021.*"
-tbb = "==2021.*"
-
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -1259,12 +1227,13 @@ files = [
 
 [[package]]
 name = "nvidia-cudnn-cu12"
-version = "8.9.2.26"
+version = "9.1.0.70"
 description = "cuDNN runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9"},
+    {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"},
+    {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"},
 ]
 
 [package.dependencies]
@@ -2404,19 +2373,6 @@ files = [
 [package.dependencies]
 mpmath = ">=0.19"
 
-[[package]]
-name = "tbb"
-version = "2021.12.0"
-description = "Intel® oneAPI Threading Building Blocks (oneTBB)"
-optional = false
-python-versions = "*"
-files = [
-    {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"},
-    {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"},
-    {file = "tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"},
-    {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"},
-]
-
 [[package]]
 name = "tensorboard"
 version = "2.16.2"
@@ -2587,34 +2543,43 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"]
 
 [[package]]
 name = "torch"
-version = "2.4.0.dev20240511"
+version = "2.4.0"
 description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "torch-2.4.0.dev20240511-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5c39786ed1fb10807a4a9a470fc9ba9f31f2e8c2698b637646c9eec1e63cac9"},
-    {file = "torch-2.4.0.dev20240511-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:e08fdf100660bc98e89d575aac51aca58c3169c15d09451180415ae0642f033f"},
-    {file = "torch-2.4.0.dev20240511-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78e824f60214118e813f9ade7ef1452f34230bd34fa49593c6feacefeb6d4d50"},
-    {file = "torch-2.4.0.dev20240511-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ea48bdaf0da647938961d76ac6921acfe264343199417bf5de8afbe0a4318fc0"},
-    {file = "torch-2.4.0.dev20240511-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7eb1ed96372840cf620dbb3fc779a53f2a4a69d36473f93d1a1771c356696e59"},
-    {file = "torch-2.4.0.dev20240511-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:5afd150809c9bf0784bd604d76ce1bfb4e3e727aac0e814a229e87f8e4f8ae4f"},
-    {file = "torch-2.4.0.dev20240511-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ac5080032a28612e1d07e29106f703a8b6956e5b779cf7ba1cf77e39591e698"},
-    {file = "torch-2.4.0.dev20240511-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:e8f57c653400c21b61f0a5c127093480de6fea3abff31c24be38a876f20c386e"},
-    {file = "torch-2.4.0.dev20240511-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524de1132e22f3a6767089cddba89dca2edebd27e7301d9dc91b12f0b1bb30fe"},
-    {file = "torch-2.4.0.dev20240511-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:22f07be89eae0e0e966d96d16ba21e18f70bbfbbfbbf46c79e8cbb1b3abb7f2c"},
+    {file = "torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:4ed94583e244af51d6a8d28701ca5a9e02d1219e782f5a01dd401f90af17d8ac"},
+    {file = "torch-2.4.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:c4ca297b7bd58b506bfd6e78ffd14eb97c0e7797dcd7965df62f50bb575d8954"},
+    {file = "torch-2.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:2497cbc7b3c951d69b276ca51fe01c2865db67040ac67f5fc20b03e41d16ea4a"},
+    {file = "torch-2.4.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:685418ab93730efbee71528821ff54005596970dd497bf03c89204fb7e3f71de"},
+    {file = "torch-2.4.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e743adadd8c8152bb8373543964551a7cb7cc20ba898dc8f9c0cdbe47c283de0"},
+    {file = "torch-2.4.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:7334325c0292cbd5c2eac085f449bf57d3690932eac37027e193ba775703c9e6"},
+    {file = "torch-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:97730014da4c57ffacb3c09298c6ce05400606e890bd7a05008d13dd086e46b1"},
+    {file = "torch-2.4.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f169b4ea6dc93b3a33319611fcc47dc1406e4dd539844dcbd2dec4c1b96e166d"},
+    {file = "torch-2.4.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:997084a0f9784d2a89095a6dc67c7925e21bf25dea0b3d069b41195016ccfcbb"},
+    {file = "torch-2.4.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:bc3988e8b36d1e8b998d143255d9408d8c75da4ab6dd0dcfd23b623dfb0f0f57"},
+    {file = "torch-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:3374128bbf7e62cdaed6c237bfd39809fbcfaa576bee91e904706840c3f2195c"},
+    {file = "torch-2.4.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:91aaf00bfe1ffa44dc5b52809d9a95129fca10212eca3ac26420eb11727c6288"},
+    {file = "torch-2.4.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cc30457ea5489c62747d3306438af00c606b509d78822a88f804202ba63111ed"},
+    {file = "torch-2.4.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a046491aaf96d1215e65e1fa85911ef2ded6d49ea34c8df4d0638879f2402eef"},
+    {file = "torch-2.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:688eec9240f3ce775f22e1e1a5ab9894f3d5fe60f3f586deb7dbd23a46a83916"},
+    {file = "torch-2.4.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:3af4de2a618fb065e78404c4ba27a818a7b7957eaeff28c6c66ce7fb504b68b8"},
+    {file = "torch-2.4.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:618808d3f610d5f180e47a697d4ec90b810953bb1e020f424b2ac7fb0884b545"},
+    {file = "torch-2.4.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:ed765d232d23566052ba83632ec73a4fccde00b4c94ad45d63b471b09d63b7a7"},
+    {file = "torch-2.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:a2feb98ac470109472fb10dfef38622a7ee08482a16c357863ebc7bc7db7c8f7"},
+    {file = "torch-2.4.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8940fc8b97a4c61fdb5d46a368f21f4a3a562a17879e932eb51a5ec62310cb31"},
 ]
 
 [package.dependencies]
 filelock = "*"
 fsspec = "*"
 jinja2 = "*"
-mkl = {version = ">=2021.1.1,<=2021.4.0", markers = "platform_system == \"Windows\""}
 networkx = "*"
 nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cudnn-cu12 = {version = "8.9.2.26", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
@@ -2622,17 +2587,13 @@ nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"
 nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 sympy = "*"
+triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
 typing-extensions = ">=4.8.0"
 
 [package.extras]
 opt-einsum = ["opt-einsum (>=3.3)"]
 optree = ["optree (>=0.11.0)"]
 
-[package.source]
-type = "legacy"
-url = "https://download.pytorch.org/whl/nightly/cpu"
-reference = "pytorch-nightly"
-
 [[package]]
 name = "torchmetrics"
 version = "1.4.0"
@@ -2681,36 +2642,42 @@ trampoline = ">=0.1.2"
 
 [[package]]
 name = "torchvision"
-version = "0.19.0.dev20240511"
+version = "0.19.0"
 description = "image and video datasets and models for torch deep learning"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "torchvision-0.19.0.dev20240511-cp310-cp310-linux_aarch64.whl", hash = "sha256:1d5a0ef004f485cbe8897b4f8352f2fc91e84531d1184287891155e15053e690"},
-    {file = "torchvision-0.19.0.dev20240511-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be987289bce21dee646e40e2d10b7aff88b42f6da358e2e85d5992b434335587"},
-    {file = "torchvision-0.19.0.dev20240511-cp311-cp311-linux_aarch64.whl", hash = "sha256:bb82dfe36f0291e253a322b50479e40b294525db9b7c2be2cc5e81877fdfaf2d"},
-    {file = "torchvision-0.19.0.dev20240511-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:716f58a100b1800a0149a923b4d292eeb35e7d1328ef0a3192e64e81b3cfde22"},
-    {file = "torchvision-0.19.0.dev20240511-cp312-cp312-linux_aarch64.whl", hash = "sha256:b2c82432ce79c2cc97aaa5e7c0f430a071560b94eb3829c2708bd9f75a3421a3"},
-    {file = "torchvision-0.19.0.dev20240511-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b2ce079b2590c295a6cd1ef44bd2c57462b7de44d5d785b655ba24cc5b2ec1bc"},
-    {file = "torchvision-0.19.0.dev20240511-cp38-cp38-linux_aarch64.whl", hash = "sha256:a0c7c9a08df9a13aa2dfaadd24a5d75a8a7d7e95ce7ed2369cf5bf7a64dfb225"},
-    {file = "torchvision-0.19.0.dev20240511-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:68b1e4b5eff68ed7868bb08dd52902326b9a7bdfec663dcc9072f882940c33d3"},
-    {file = "torchvision-0.19.0.dev20240511-cp39-cp39-linux_aarch64.whl", hash = "sha256:a314e9c75dc78e701343649baed9025991f8bf302742dd4ade68deb0a03d5029"},
-    {file = "torchvision-0.19.0.dev20240511-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cd7deb79d0eaa2f17912207726453dfea5ce14ec78bf4199dab37e0de36d656"},
+    {file = "torchvision-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec874ef85dcb24c69e600f6e276af892c80cde3ffdaeb7275efda463242bc2a8"},
+    {file = "torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:106842b1e475b14d9a04ee0d6f5477d43100e3bb78e9d31e37422384d0d84179"},
+    {file = "torchvision-0.19.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d467d434005fd05a227a2ba7af4c591bb67e6d4a97bbd06eda8da83f43e9fd07"},
+    {file = "torchvision-0.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:f77ac31f7337d0f6f4b58e65582c6c93b9d9eeec7dfd7478896b5cdc19a2d60d"},
+    {file = "torchvision-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbf3aa71a3899244fc884303ed3c4604a160824fefac77e82317a5463efc1d9b"},
+    {file = "torchvision-0.19.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:ec4162dc71d9db7f0b51d0f92491929c1419605ff436e1305e50de13504a1c30"},
+    {file = "torchvision-0.19.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:4e6aa4fa3f0bc3599fa071c149e651a3e6bdd67c9161794478f9f91471c406a2"},
+    {file = "torchvision-0.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac5525d5cc09e425b5cf5752ecf66eefbbbd8c8cd945198ce35eb01a694e6069"},
+    {file = "torchvision-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c09ef8ed184fa877f6251b620226e74f682b8f1d6b341456428d4955b8d9c670"},
+    {file = "torchvision-0.19.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:02f1dd5cfc897957535b41b0258ec452d30de044e20c2de2c75869f7708e7656"},
+    {file = "torchvision-0.19.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:be0f27a28b8e9f2ae98a31af34a4bdd2a5bf154d92bd73a5797c8d2156fb3ab6"},
+    {file = "torchvision-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6ba7756f75c80212e51d3576f85ea204589e0c16efdb9b835dd677bc8929a67"},
+    {file = "torchvision-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:854e967a16a9409e941b5bbe5aa357b23f7158bccb9de35ae20fd4945f05ecd1"},
+    {file = "torchvision-0.19.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:d9afb8a3c3ce99a161a64c2a3b91cb545632a72118053cbfb84e87a02a8dcd02"},
+    {file = "torchvision-0.19.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:079a696e0b2cb52e4be30afa8e9b3d7d280f02a2b5ffedd7e821fa1efd1a5a8d"},
+    {file = "torchvision-0.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:aaa338ff3a55a8c0f94e0e64eff6fe2af1fc933a95fd43812760e72ea66e986b"},
+    {file = "torchvision-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd1279571d4b68d5a53d9b7a35aedf91c4cb1e0b08099f6a1effa7b25b8c95e7"},
+    {file = "torchvision-0.19.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:4d54b5e19b7ebebca7d0b08497b4c6335264cad04c94c05fa35988d9e9eed0c4"},
+    {file = "torchvision-0.19.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5f9a598dcf82bdfc8e4436ce74763b3877dabec3b33f94613b94ede13e3e4dee"},
+    {file = "torchvision-0.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:ec1281c10402234d470bfd4d53663d81f4364f293b2f8fe24d4a7a1adc78c90c"},
 ]
 
 [package.dependencies]
 numpy = "*"
 pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0"
-torch = "2.4.0.dev20240511"
+torch = "2.4.0"
 
 [package.extras]
+gdown = ["gdown (>=4.7.3)"]
 scipy = ["scipy"]
 
-[package.source]
-type = "legacy"
-url = "https://download.pytorch.org/whl/nightly/cpu"
-reference = "pytorch-nightly"
-
 [[package]]
 name = "tqdm"
 version = "4.66.4"
@@ -2807,6 +2774,33 @@ torchhub = ["filelock", "huggingface-hub (>=0.23.0,<1.0)", "importlib-metadata",
 video = ["av (==9.2.0)", "decord (==0.6.0)"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]
 
+[[package]]
+name = "triton"
+version = "3.0.0"
+description = "A language and compiler for custom Deep Learning operations"
+optional = false
+python-versions = "*"
+files = [
+    {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"},
+    {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"},
+    {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
+    {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
+    {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
+    {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
+    {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
+    {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
+    {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
+    {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
+]
+
+[package.dependencies]
+filelock = "*"
+
+[package.extras]
+build = ["cmake (>=3.20)", "lit"]
+tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
+tutorials = ["matplotlib", "pandas", "tabulate"]
+
 [[package]]
 name = "typing-extensions"
 version = "4.11.0"
@@ -3153,4 +3147,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "1a5ac00127a8eb805d231531c57bfcc18212f71be7cc0b04f6d635984b71f6eb"
+content-hash = "72aebdd01056a46ebd9f42a652df932ddbb8d537409e432bca6ff2a97b97a6c0"
diff --git a/install/apple/pyproject.toml b/install/apple/pyproject.toml
index 9b3c7248..c1fbd64a 100644
--- a/install/apple/pyproject.toml
+++ b/install/apple/pyproject.toml
@@ -8,9 +8,9 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-torch = { version = "^2.4.0.dev20240511", source = "pytorch-nightly" }
-torchvision = "^0.19.0.dev20240511"
-diffusers = {git = "https://github.com/huggingface/diffusers"}
+torch = "^2.4.0"
+torchvision = "^0.19.0"
+diffusers = {git = "https://github.com/bghira/diffusers", rev = "feature/lavender-flow-complete"}
 transformers = "^4.41.2"
 datasets = "^2.14.3"
 dadaptation = "^3.1"

From 48a983b678d14ae86dd3c4fee6fd9e7328250fc7 Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Wed, 24 Jul 2024 11:57:09 -0600
Subject: [PATCH 2/5] gradient accumulation steps should use fp32 accumulations
 by default

---
 helpers/arguments.py | 24 ++++++++++++++++++++++++
 train.py             |  7 +++++++
 2 files changed, 31 insertions(+)

diff --git a/helpers/arguments.py b/helpers/arguments.py
index aed1d1b7..d9f096ee 100644
--- a/helpers/arguments.py
+++ b/helpers/arguments.py
@@ -1335,6 +1335,17 @@ def parse_args(input_args=None):
             " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
         ),
     )
+    parser.add_argument(
+        "--gradient_precision",
+        type=str,
+        choices=["unmodified", "fp32"],
+        default=None,
+        help=(
+            "One of the hallmark discoveries of the Llama 3.1 paper is numeric instability when calculating"
+            " gradients in bf16 precision. The default behaviour when gradient accumulation steps are enabled"
+            " is now to use fp32 gradients, which is slower, but provides more accurate updates."
+        ),
+    )
     parser.add_argument(
         "--local_rank",
         type=int,
@@ -1859,4 +1870,17 @@ def parse_args(input_args=None):
                     f"{'PixArt Sigma' if args.pixart_sigma else 'Stable Diffusion 3'} requires --max_grad_norm=0.01 to prevent model collapse. Overriding value. Set this value manually to disable this warning."
                 )
                 args.max_grad_norm = 0.01
+
+    if args.gradient_accumulation_steps > 1:
+        if args.gradient_precision == "unmodified":
+            warning_log(
+                "Gradient accumulation steps are enabled, but gradient precision is set to 'unmodified'."
+                " This may lead to numeric instability. Consider setting --gradient_precision=fp32."
+            )
+        elif args.gradient_precision is None or args.gradient_precision == "fp32":
+            info_log(
+                "Gradient accumulation steps are enabled, and gradient precision is set to 'fp32'."
+            )
+            args.gradient_precision = "fp32"
+
     return args
diff --git a/train.py b/train.py
index 44882259..94b77315 100644
--- a/train.py
+++ b/train.py
@@ -2037,6 +2037,13 @@ def main():
                             f"NaNs detected. Loss: {loss}, Model prediction: {model_pred}, Target: {target}"
                         )
                     accelerator.backward(loss)
+
+                    if args.gradient_precision == "fp32":
+                        # After backward, convert gradients to fp32 for stable accumulation
+                        for param in params_to_optimize:
+                            if param.grad is not None:
+                                param.grad.data = param.grad.data.to(torch.float32)
+
                     grad_norm = None
                     if (
                         accelerator.sync_gradients

From 2a4c0e4be8b454c5d403c2796901c884bc6a94f3 Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Wed, 24 Jul 2024 12:16:25 -0600
Subject: [PATCH 3/5] relax enforcement of gradient accumulation precision
 boost due to vram issues

---
 helpers/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/helpers/arguments.py b/helpers/arguments.py
index d9f096ee..c50ba62f 100644
--- a/helpers/arguments.py
+++ b/helpers/arguments.py
@@ -1872,12 +1872,12 @@ def parse_args(input_args=None):
                 args.max_grad_norm = 0.01
 
     if args.gradient_accumulation_steps > 1:
-        if args.gradient_precision == "unmodified":
+        if args.gradient_precision == "unmodified" or args.gradient_precision is None:
             warning_log(
                 "Gradient accumulation steps are enabled, but gradient precision is set to 'unmodified'."
                 " This may lead to numeric instability. Consider setting --gradient_precision=fp32."
             )
-        elif args.gradient_precision is None or args.gradient_precision == "fp32":
+        elif args.gradient_precision == "fp32":
             info_log(
                 "Gradient accumulation steps are enabled, and gradient precision is set to 'fp32'."
             )

From d19797032e4ae257d37a61e045cf8f08aea0b2bd Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Wed, 24 Jul 2024 13:40:27 -0600
Subject: [PATCH 4/5] update model feature lists

---
 README.md | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 81e6e8ae..52c65cdf 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,14 @@
 - [Design Philosophy](#design-philosophy)
 - [Tutorial](#tutorial)
 - [Features](#features)
-- [Hardware Requirements](#hardware-requirements)
-  - [SDXL](#sdxl)
-  - [Stable Diffusion 2.0/2.1](#stable-diffusion-2x)
+  - [PixArt Sigma](#pixart-sigma)
+  - [Stable Diffusion 2.0 & 2.1](#stable-diffusion-20--21)
   - [Stable Diffusion 3.0](#stable-diffusion-3)
+  - [AuraFlow](#auraflow)
+  - [Kwai Kolors](#kwai-kolors)
+- [Hardware Requirements](#hardware-requirements)
+  - [SDXL](#sdxl-1024px)
+  - [Stable Diffusion (Legacy)](#stable-diffusion-2x-768px)
 - [Scripts](#scripts)
 - [Toolkit](#toolkit)
 - [Setup](#setup)
@@ -59,7 +63,16 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP
 - Webhook support for updating eg. Discord channels with your training progress, validations, and errors
 - Integration with the [Hugging Face Hub](https://huggingface.co) for seamless model upload and nice automatically-generated model cards.
 
-### Stable Diffusion 2.0/2.1
+### PixArt Sigma
+
+SimpleTuner has extensive training integration with PixArt Sigma - both the 600M & 900M models load without any fuss.
+
+- Text encoder training is not supported, as T5 is enormous.
+- LoRA and full tuning both work as expected
+- ControlNet training is not yet supported
+- [Two-stage PixArt](https://huggingface.co/ptx0/pixart-900m-1024-ft-v0.7-stage1) training support (see: [MIXTURE_OF_EXPERTS](/documentation/MIXTURE_OF_EXPERTS.md))
+
+### Stable Diffusion 2.0 & 2.1
 
 Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn't have to be the case. Related features in SimpleTuner include:
 

From e6334fa18d078d90822792fdd5adf47bd3eb3597 Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Wed, 24 Jul 2024 13:54:15 -0600
Subject: [PATCH 5/5] revise features in readme

---
 README.md | 56 +++++++++++++++++++++----------------------------------
 1 file changed, 21 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 52c65cdf..79acea77 100644
--- a/README.md
+++ b/README.md
@@ -4,14 +4,6 @@
 
 **SimpleTuner** is a repository dedicated to a set of experimental scripts designed for training optimization. The project is geared towards simplicity, with a focus on making the code easy to read and understand. This codebase serves as a shared academic exercise, and contributions are welcome.
 
-- Multi-GPU training
-- Aspect bucketing "just works"; fill a folder of images and let it rip
-- Multiple datasets can be used in a single training session, each with a different base resolution.
-- VRAM-saving techniques, such as pre-computing VAE and text encoder outputs
-- Full featured fine-tuning support
-  - Bias training (BitFit)
-- LoRA training support
-
 ## Table of Contents
 
 - [Design Philosophy](#design-philosophy)
@@ -25,6 +17,7 @@
 - [Hardware Requirements](#hardware-requirements)
   - [SDXL](#sdxl-1024px)
   - [Stable Diffusion (Legacy)](#stable-diffusion-2x-768px)
+  - [AuraFlow v0.1](#auraflow-v01)
 - [Scripts](#scripts)
 - [Toolkit](#toolkit)
 - [Setup](#setup)
@@ -48,21 +41,21 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP
 
 ## Features
 
-- Precomputed VAE (latents) outputs saved to storage, eliminating the need to invoke the VAE during training.
-- Precomputed captions are run through the text encoder(s) and saved to storage to save on VRAM.
-- Trainable on a 24G GPU, or even down to 16G at lower base resolutions.
-  - LoRA training for SDXL, SD3, and SD 2.x that uses less than 16G VRAM.
+- Multi-GPU training
+- Image and caption features (embeds) are cached to the hard drive in advance, so that training runs faster and with less memory consumption
+- Aspect bucketing: support for a variety of image sizes and aspect ratios, enabling widescreen and portrait training.
+- Refiner LoRA or full u-net training for SDXL
+- Most models are trainable on a 24G GPU, or even down to 16G at lower base resolutions.
+  - LoRA training for PixArt, SDXL, SD3, and SD 2.x that uses less than 16G VRAM; AuraFlow uses less than 24G VRAM
 - DeepSpeed integration allowing for [training SDXL's full u-net on 12G of VRAM](/documentation/DEEPSPEED.md), albeit very slowly.
 - Optional EMA (Exponential moving average) weight network to counteract model overfitting and improve training stability. **Note:** This does not apply to LoRA.
-- Support for a variety of image sizes and aspect ratios, enabling widescreen and portrait training.
 - Train directly from an S3-compatible storage provider, eliminating the requirement for expensive local storage. (Tested with Cloudflare R2 and Wasabi S3)
-- [DeepFloyd stage I and II full u-net or parameter-efficient fine-tuning](/documentation/DEEPFLOYD.md) via LoRA using 22G VRAM
-- SDXL Refiner LoRA or full u-net training, incl validation using img2img
-- Full [ControlNet model training](/documentation/CONTROLNET.md) (not ControlLoRA or ControlLite)
+- Full [ControlNet model training](/documentation/CONTROLNET.md) (not ControlLoRA or ControlLite), currently for SDXL and SD 1.x/2.x only
 - Training [Mixture of Experts](/documentation/MIXTURE_OF_EXPERTS.md) for lightweight, high-quality diffusion models
 - Webhook support for updating eg. Discord channels with your training progress, validations, and errors
 - Integration with the [Hugging Face Hub](https://huggingface.co) for seamless model upload and nice automatically-generated model cards.
 
+
 ### PixArt Sigma
 
 SimpleTuner has extensive training integration with PixArt Sigma - both the 600M & 900M models load without any fuss.
@@ -72,6 +65,8 @@ SimpleTuner has extensive training integration with PixArt Sigma - both the 600M
 - ControlNet training is not yet supported
 - [Two-stage PixArt](https://huggingface.co/ptx0/pixart-900m-1024-ft-v0.7-stage1) training support (see: [MIXTURE_OF_EXPERTS](/documentation/MIXTURE_OF_EXPERTS.md))
 
+See the [PixArt Quickstart](/documentation/quickstart/SIGMA.md) guide to start training.
+
 ### Stable Diffusion 2.0 & 2.1
 
 Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn't have to be the case. Related features in SimpleTuner include:
@@ -83,17 +78,12 @@ Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn'
 
 ### Stable Diffusion 3
 
-This model is very new and the current level of support for it in SimpleTuner is preliminary:
-
 - LoRA and full finetuning are supported as usual.
 - ControlNet is not yet implemented.
 - Certain features such as segmented timestep selection and Compel long prompt weighting are not yet supported.
+- Parameters have been optimised to get the best results, validated through from-scratch training of SD3 models
 
-A few sharp edges could catch you off-guard, but for the most part, this initial pass at SD3 support is considered to be robust enough not to let you screw up too many parameters - it will oftentimes simply override bad values and set them for more sensible ones.
-
-Simply point your base model to a Stable Diffusion 3 checkpoint and set `STABLE_DIFFUSION_3=true` in your environment file.
-
-> ⚠️ In the current source release of Diffusers, gradient checkpointing is broken for Stable Diffusion 3 models. This will result in much, much higher memory use.
+See the [Stable Diffusion 3 Quickstart](/documentation/quickstart/SD3.md) to get going.
 
 ### AuraFlow
 
@@ -103,18 +93,6 @@ Currently, AuraFlow v0.1 has limited support for SimpleTuner:
 - All limitations that apply to Stable Diffusion 3 also apply to AuraFlow
 - LoRA is currently the only viable method of AuraFlow training
 
-This model is very large, and will require more resources to train than PixArt or SDXL.
-
-AuraFlow has some distinct advantages that make it worth investigating over Stable Diffusion 3:
-
-- It is the largest open text-to-image model with a truly open license
-- It uses the SDXL 4ch VAE which arguably provides an easier learning objective over the 16ch VAE from Stable Diffusion 3
-  - Though small newspaper or book print text suffers at 4ch compression levels, the overall fine details makes this approach viable.
-- It uses just a single text encoder versus Stable Diffusion's three text encoders
-  - AuraFlow leverages EleutherAI's **Pile-T5** which was trained on **twice as much data** with **fewer parameters** than Stable Diffusion 3, DeepFloyd, and PixArt's **T5-XXL v1.1**
-  - Pile-T5 has gone through less content prefiltering than OpenCLIP or T5 v1.1, and has "consumed more of the Internet" than T5 v1.1
-  - With a large data corpus, it has potential for subtle semantic understanding of linguistic oddities, and understanding of more modern concepts without finetuning the text encoder
-
 ### Kwai Kolors
 
 An SDXL-based model with ChatGLM (General Language Model) 6B as its text encoder, **doubling** the hidden dimension size and substantially increasing the level of local detail included in the prompt embeds.
@@ -150,6 +128,14 @@ Without EMA, more care must be taken not to drastically change the model leading
 - NVIDIA RTX 4090 or better (24G, no EMA)
 - NVIDIA RTX 4080 or better (LoRA only)
 
+### AuraFlow v0.1
+
+This model is very large; it will require more resources to train than any other supported model, incurring a substantial hardware cost.
+
+- Full tuning will OOM at a batch size of 1 on a single 80G GPU. A system with 8x A100-80G (SXM4) is a recommended minimum for FSDP (DeepSpeed ZeRO Stage 2) training.
+- LoRA training will OOM at a batch size of 1 on a single 16G GPU. A single 24G GPU is the minimum requirement, and a 48G GPU is ideal.
+
+
 ## Scripts
 
 - `ubuntu.sh` - This is a basic "installer" that makes it quick to deploy on a Vast.ai instance. It might not work for every single container image.