From c239b79354bf750d6523d61ac8910d04a09c3990 Mon Sep 17 00:00:00 2001 From: bghira Date: Wed, 24 Jul 2024 11:31:19 -0600 Subject: [PATCH 1/5] apple: update to official pytorch 2.4 release --- install/apple/poetry.lock | 166 +++++++++++++++++------------------ install/apple/pyproject.toml | 6 +- 2 files changed, 83 insertions(+), 89 deletions(-) diff --git a/install/apple/poetry.lock b/install/apple/poetry.lock index 90170559..79bb0260 100644 --- a/install/apple/poetry.lock +++ b/install/apple/poetry.lock @@ -495,9 +495,9 @@ training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.6.0)", "pr [package.source] type = "git" -url = "https://github.com/huggingface/diffusers" -reference = "HEAD" -resolved_reference = "a9c403c00197d8d6eb854128dda2f7849cedd100" +url = "https://github.com/bghira/diffusers" +reference = "feature/lavender-flow-complete" +resolved_reference = "0d3fcdbca358c3b93c917f7fd7eac575fd95d898" [[package]] name = "dill" @@ -836,20 +836,6 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link perf = ["ipython"] testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] -[[package]] -name = "intel-openmp" -version = "2021.4.0" -description = "Intel OpenMP* Runtime Library" -optional = false -python-versions = "*" -files = [ - {file = "intel_openmp-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:41c01e266a7fdb631a7609191709322da2bbf24b252ba763f125dd651bcc7675"}, - {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:3b921236a38384e2016f0f3d65af6732cf2c12918087128a9163225451e776f2"}, - {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:e2240ab8d01472fed04f3544a878cda5da16c26232b7ea1b59132dbfb48b186e"}, - {file = "intel_openmp-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:6e863d8fd3d7e8ef389d52cf97a50fe2afe1a19247e8c0d168ce021546f96fc9"}, - {file = "intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:eef4c8bcc8acefd7f5cd3b9384dbf73d59e2c99fc56545712ded913f43c4a94f"}, -] - [[package]] name = "iterutils" version = "0.1.6" @@ -996,24 +982,6 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] -[[package]] -name = "mkl" -version = "2021.4.0" -description = "Intel® oneAPI Math Kernel Library" -optional = false -python-versions = "*" -files = [ - {file = "mkl-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:67460f5cd7e30e405b54d70d1ed3ca78118370b65f7327d495e9c8847705e2fb"}, - {file = "mkl-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:636d07d90e68ccc9630c654d47ce9fdeb036bb46e2b193b3a9ac8cfea683cce5"}, - {file = "mkl-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:398dbf2b0d12acaf54117a5210e8f191827f373d362d796091d161f610c1ebfb"}, - {file = "mkl-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:439c640b269a5668134e3dcbcea4350459c4a8bc46469669b2d67e07e3d330e8"}, - {file = "mkl-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:ceef3cafce4c009dd25f65d7ad0d833a0fbadc3d8903991ec92351fe5de1e718"}, -] - -[package.dependencies] -intel-openmp = "==2021.*" -tbb = "==2021.*" - [[package]] name = "mpmath" version = "1.3.0" @@ -1259,12 +1227,13 @@ files = [ [[package]] name = "nvidia-cudnn-cu12" -version = "8.9.2.26" +version = "9.1.0.70" description = "cuDNN runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9"}, + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, ] [package.dependencies] @@ -2404,19 +2373,6 @@ files = [ [package.dependencies] mpmath = ">=0.19" -[[package]] -name = "tbb" -version = "2021.12.0" -description = "Intel® oneAPI Threading Building Blocks (oneTBB)" -optional = false -python-versions = "*" -files = [ - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"}, - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"}, - {file = "tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"}, - {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"}, -] - [[package]] name = "tensorboard" version = "2.16.2" @@ -2587,34 +2543,43 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] [[package]] name = "torch" -version = "2.4.0.dev20240511" +version = "2.4.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" files = [ - {file = "torch-2.4.0.dev20240511-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5c39786ed1fb10807a4a9a470fc9ba9f31f2e8c2698b637646c9eec1e63cac9"}, - {file = "torch-2.4.0.dev20240511-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:e08fdf100660bc98e89d575aac51aca58c3169c15d09451180415ae0642f033f"}, - {file = "torch-2.4.0.dev20240511-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78e824f60214118e813f9ade7ef1452f34230bd34fa49593c6feacefeb6d4d50"}, - {file = "torch-2.4.0.dev20240511-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ea48bdaf0da647938961d76ac6921acfe264343199417bf5de8afbe0a4318fc0"}, - {file = "torch-2.4.0.dev20240511-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7eb1ed96372840cf620dbb3fc779a53f2a4a69d36473f93d1a1771c356696e59"}, - {file = "torch-2.4.0.dev20240511-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:5afd150809c9bf0784bd604d76ce1bfb4e3e727aac0e814a229e87f8e4f8ae4f"}, - {file = "torch-2.4.0.dev20240511-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ac5080032a28612e1d07e29106f703a8b6956e5b779cf7ba1cf77e39591e698"}, - {file = "torch-2.4.0.dev20240511-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:e8f57c653400c21b61f0a5c127093480de6fea3abff31c24be38a876f20c386e"}, - {file = "torch-2.4.0.dev20240511-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524de1132e22f3a6767089cddba89dca2edebd27e7301d9dc91b12f0b1bb30fe"}, - {file = "torch-2.4.0.dev20240511-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:22f07be89eae0e0e966d96d16ba21e18f70bbfbbfbbf46c79e8cbb1b3abb7f2c"}, + {file = "torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:4ed94583e244af51d6a8d28701ca5a9e02d1219e782f5a01dd401f90af17d8ac"}, + {file = "torch-2.4.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:c4ca297b7bd58b506bfd6e78ffd14eb97c0e7797dcd7965df62f50bb575d8954"}, + {file = "torch-2.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:2497cbc7b3c951d69b276ca51fe01c2865db67040ac67f5fc20b03e41d16ea4a"}, + {file = "torch-2.4.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:685418ab93730efbee71528821ff54005596970dd497bf03c89204fb7e3f71de"}, + {file = "torch-2.4.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e743adadd8c8152bb8373543964551a7cb7cc20ba898dc8f9c0cdbe47c283de0"}, + {file = "torch-2.4.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:7334325c0292cbd5c2eac085f449bf57d3690932eac37027e193ba775703c9e6"}, + {file = "torch-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:97730014da4c57ffacb3c09298c6ce05400606e890bd7a05008d13dd086e46b1"}, + {file = "torch-2.4.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f169b4ea6dc93b3a33319611fcc47dc1406e4dd539844dcbd2dec4c1b96e166d"}, + {file = "torch-2.4.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:997084a0f9784d2a89095a6dc67c7925e21bf25dea0b3d069b41195016ccfcbb"}, + {file = "torch-2.4.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:bc3988e8b36d1e8b998d143255d9408d8c75da4ab6dd0dcfd23b623dfb0f0f57"}, + {file = "torch-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:3374128bbf7e62cdaed6c237bfd39809fbcfaa576bee91e904706840c3f2195c"}, + {file = "torch-2.4.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:91aaf00bfe1ffa44dc5b52809d9a95129fca10212eca3ac26420eb11727c6288"}, + {file = "torch-2.4.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cc30457ea5489c62747d3306438af00c606b509d78822a88f804202ba63111ed"}, + {file = "torch-2.4.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a046491aaf96d1215e65e1fa85911ef2ded6d49ea34c8df4d0638879f2402eef"}, + {file = "torch-2.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:688eec9240f3ce775f22e1e1a5ab9894f3d5fe60f3f586deb7dbd23a46a83916"}, + {file = "torch-2.4.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:3af4de2a618fb065e78404c4ba27a818a7b7957eaeff28c6c66ce7fb504b68b8"}, + {file = "torch-2.4.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:618808d3f610d5f180e47a697d4ec90b810953bb1e020f424b2ac7fb0884b545"}, + {file = "torch-2.4.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:ed765d232d23566052ba83632ec73a4fccde00b4c94ad45d63b471b09d63b7a7"}, + {file = "torch-2.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:a2feb98ac470109472fb10dfef38622a7ee08482a16c357863ebc7bc7db7c8f7"}, + {file = "torch-2.4.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8940fc8b97a4c61fdb5d46a368f21f4a3a562a17879e932eb51a5ec62310cb31"}, ] [package.dependencies] filelock = "*" fsspec = "*" jinja2 = "*" -mkl = {version = ">=2021.1.1,<=2021.4.0", markers = "platform_system == \"Windows\""} networkx = "*" nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cudnn-cu12 = {version = "8.9.2.26", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} @@ -2622,17 +2587,13 @@ nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \" nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} sympy = "*" +triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} typing-extensions = ">=4.8.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.11.0)"] -[package.source] -type = "legacy" -url = "https://download.pytorch.org/whl/nightly/cpu" -reference = "pytorch-nightly" - [[package]] name = "torchmetrics" version = "1.4.0" @@ -2681,36 +2642,42 @@ trampoline = ">=0.1.2" [[package]] name = "torchvision" -version = "0.19.0.dev20240511" +version = "0.19.0" description = "image and video datasets and models for torch deep learning" optional = false python-versions = ">=3.8" files = [ - {file = "torchvision-0.19.0.dev20240511-cp310-cp310-linux_aarch64.whl", hash = "sha256:1d5a0ef004f485cbe8897b4f8352f2fc91e84531d1184287891155e15053e690"}, - {file = "torchvision-0.19.0.dev20240511-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be987289bce21dee646e40e2d10b7aff88b42f6da358e2e85d5992b434335587"}, - {file = "torchvision-0.19.0.dev20240511-cp311-cp311-linux_aarch64.whl", hash = "sha256:bb82dfe36f0291e253a322b50479e40b294525db9b7c2be2cc5e81877fdfaf2d"}, - {file = "torchvision-0.19.0.dev20240511-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:716f58a100b1800a0149a923b4d292eeb35e7d1328ef0a3192e64e81b3cfde22"}, - {file = "torchvision-0.19.0.dev20240511-cp312-cp312-linux_aarch64.whl", hash = "sha256:b2c82432ce79c2cc97aaa5e7c0f430a071560b94eb3829c2708bd9f75a3421a3"}, - {file = "torchvision-0.19.0.dev20240511-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b2ce079b2590c295a6cd1ef44bd2c57462b7de44d5d785b655ba24cc5b2ec1bc"}, - {file = "torchvision-0.19.0.dev20240511-cp38-cp38-linux_aarch64.whl", hash = "sha256:a0c7c9a08df9a13aa2dfaadd24a5d75a8a7d7e95ce7ed2369cf5bf7a64dfb225"}, - {file = "torchvision-0.19.0.dev20240511-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:68b1e4b5eff68ed7868bb08dd52902326b9a7bdfec663dcc9072f882940c33d3"}, - {file = "torchvision-0.19.0.dev20240511-cp39-cp39-linux_aarch64.whl", hash = "sha256:a314e9c75dc78e701343649baed9025991f8bf302742dd4ade68deb0a03d5029"}, - {file = "torchvision-0.19.0.dev20240511-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cd7deb79d0eaa2f17912207726453dfea5ce14ec78bf4199dab37e0de36d656"}, + {file = "torchvision-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec874ef85dcb24c69e600f6e276af892c80cde3ffdaeb7275efda463242bc2a8"}, + {file = "torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:106842b1e475b14d9a04ee0d6f5477d43100e3bb78e9d31e37422384d0d84179"}, + {file = "torchvision-0.19.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d467d434005fd05a227a2ba7af4c591bb67e6d4a97bbd06eda8da83f43e9fd07"}, + {file = "torchvision-0.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:f77ac31f7337d0f6f4b58e65582c6c93b9d9eeec7dfd7478896b5cdc19a2d60d"}, + {file = "torchvision-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbf3aa71a3899244fc884303ed3c4604a160824fefac77e82317a5463efc1d9b"}, + {file = "torchvision-0.19.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:ec4162dc71d9db7f0b51d0f92491929c1419605ff436e1305e50de13504a1c30"}, + {file = "torchvision-0.19.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:4e6aa4fa3f0bc3599fa071c149e651a3e6bdd67c9161794478f9f91471c406a2"}, + {file = "torchvision-0.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac5525d5cc09e425b5cf5752ecf66eefbbbd8c8cd945198ce35eb01a694e6069"}, + {file = "torchvision-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c09ef8ed184fa877f6251b620226e74f682b8f1d6b341456428d4955b8d9c670"}, + {file = "torchvision-0.19.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:02f1dd5cfc897957535b41b0258ec452d30de044e20c2de2c75869f7708e7656"}, + {file = "torchvision-0.19.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:be0f27a28b8e9f2ae98a31af34a4bdd2a5bf154d92bd73a5797c8d2156fb3ab6"}, + {file = "torchvision-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6ba7756f75c80212e51d3576f85ea204589e0c16efdb9b835dd677bc8929a67"}, + {file = "torchvision-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:854e967a16a9409e941b5bbe5aa357b23f7158bccb9de35ae20fd4945f05ecd1"}, + {file = "torchvision-0.19.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:d9afb8a3c3ce99a161a64c2a3b91cb545632a72118053cbfb84e87a02a8dcd02"}, + {file = "torchvision-0.19.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:079a696e0b2cb52e4be30afa8e9b3d7d280f02a2b5ffedd7e821fa1efd1a5a8d"}, + {file = "torchvision-0.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:aaa338ff3a55a8c0f94e0e64eff6fe2af1fc933a95fd43812760e72ea66e986b"}, + {file = "torchvision-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd1279571d4b68d5a53d9b7a35aedf91c4cb1e0b08099f6a1effa7b25b8c95e7"}, + {file = "torchvision-0.19.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:4d54b5e19b7ebebca7d0b08497b4c6335264cad04c94c05fa35988d9e9eed0c4"}, + {file = "torchvision-0.19.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5f9a598dcf82bdfc8e4436ce74763b3877dabec3b33f94613b94ede13e3e4dee"}, + {file = "torchvision-0.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:ec1281c10402234d470bfd4d53663d81f4364f293b2f8fe24d4a7a1adc78c90c"}, ] [package.dependencies] numpy = "*" pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" -torch = "2.4.0.dev20240511" +torch = "2.4.0" [package.extras] +gdown = ["gdown (>=4.7.3)"] scipy = ["scipy"] -[package.source] -type = "legacy" -url = "https://download.pytorch.org/whl/nightly/cpu" -reference = "pytorch-nightly" - [[package]] name = "tqdm" version = "4.66.4" @@ -2807,6 +2774,33 @@ torchhub = ["filelock", "huggingface-hub (>=0.23.0,<1.0)", "importlib-metadata", video = ["av (==9.2.0)", "decord (==0.6.0)"] vision = ["Pillow (>=10.0.1,<=15.0)"] +[[package]] +name = "triton" +version = "3.0.0" +description = "A language and compiler for custom Deep Learning operations" +optional = false +python-versions = "*" +files = [ + {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, + {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, + {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, + {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, + {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, +] + +[package.dependencies] +filelock = "*" + +[package.extras] +build = ["cmake (>=3.20)", "lit"] +tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] +tutorials = ["matplotlib", "pandas", "tabulate"] + [[package]] name = "typing-extensions" version = "4.11.0" @@ -3153,4 +3147,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "1a5ac00127a8eb805d231531c57bfcc18212f71be7cc0b04f6d635984b71f6eb" +content-hash = "72aebdd01056a46ebd9f42a652df932ddbb8d537409e432bca6ff2a97b97a6c0" diff --git a/install/apple/pyproject.toml b/install/apple/pyproject.toml index 9b3c7248..c1fbd64a 100644 --- a/install/apple/pyproject.toml +++ b/install/apple/pyproject.toml @@ -8,9 +8,9 @@ readme = "README.md" [tool.poetry.dependencies] python = ">=3.9,<3.13" -torch = { version = "^2.4.0.dev20240511", source = "pytorch-nightly" } -torchvision = "^0.19.0.dev20240511" -diffusers = {git = "https://github.com/huggingface/diffusers"} +torch = "^2.4.0" +torchvision = "^0.19.0" +diffusers = {git = "https://github.com/bghira/diffusers", rev = "feature/lavender-flow-complete"} transformers = "^4.41.2" datasets = "^2.14.3" dadaptation = "^3.1" From 48a983b678d14ae86dd3c4fee6fd9e7328250fc7 Mon Sep 17 00:00:00 2001 From: bghira Date: Wed, 24 Jul 2024 11:57:09 -0600 Subject: [PATCH 2/5] gradient accumulation steps should use fp32 accumulations by default --- helpers/arguments.py | 24 ++++++++++++++++++++++++ train.py | 7 +++++++ 2 files changed, 31 insertions(+) diff --git a/helpers/arguments.py b/helpers/arguments.py index aed1d1b7..d9f096ee 100644 --- a/helpers/arguments.py +++ b/helpers/arguments.py @@ -1335,6 +1335,17 @@ def parse_args(input_args=None): " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." ), ) + parser.add_argument( + "--gradient_precision", + type=str, + choices=["unmodified", "fp32"], + default=None, + help=( + "One of the hallmark discoveries of the Llama 3.1 paper is numeric instability when calculating" + " gradients in bf16 precision. The default behaviour when gradient accumulation steps are enabled" + " is now to use fp32 gradients, which is slower, but provides more accurate updates." + ), + ) parser.add_argument( "--local_rank", type=int, @@ -1859,4 +1870,17 @@ def parse_args(input_args=None): f"{'PixArt Sigma' if args.pixart_sigma else 'Stable Diffusion 3'} requires --max_grad_norm=0.01 to prevent model collapse. Overriding value. Set this value manually to disable this warning." ) args.max_grad_norm = 0.01 + + if args.gradient_accumulation_steps > 1: + if args.gradient_precision == "unmodified": + warning_log( + "Gradient accumulation steps are enabled, but gradient precision is set to 'unmodified'." + " This may lead to numeric instability. Consider setting --gradient_precision=fp32." + ) + elif args.gradient_precision is None or args.gradient_precision == "fp32": + info_log( + "Gradient accumulation steps are enabled, and gradient precision is set to 'fp32'." + ) + args.gradient_precision = "fp32" + return args diff --git a/train.py b/train.py index 44882259..94b77315 100644 --- a/train.py +++ b/train.py @@ -2037,6 +2037,13 @@ def main(): f"NaNs detected. Loss: {loss}, Model prediction: {model_pred}, Target: {target}" ) accelerator.backward(loss) + + if args.gradient_precision == "fp32": + # After backward, convert gradients to fp32 for stable accumulation + for param in params_to_optimize: + if param.grad is not None: + param.grad.data = param.grad.data.to(torch.float32) + grad_norm = None if ( accelerator.sync_gradients From 2a4c0e4be8b454c5d403c2796901c884bc6a94f3 Mon Sep 17 00:00:00 2001 From: bghira Date: Wed, 24 Jul 2024 12:16:25 -0600 Subject: [PATCH 3/5] relax enforcement of gradient accumulation precision boost due to vram issues --- helpers/arguments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helpers/arguments.py b/helpers/arguments.py index d9f096ee..c50ba62f 100644 --- a/helpers/arguments.py +++ b/helpers/arguments.py @@ -1872,12 +1872,12 @@ def parse_args(input_args=None): args.max_grad_norm = 0.01 if args.gradient_accumulation_steps > 1: - if args.gradient_precision == "unmodified": + if args.gradient_precision == "unmodified" or args.gradient_precision is None: warning_log( "Gradient accumulation steps are enabled, but gradient precision is set to 'unmodified'." " This may lead to numeric instability. Consider setting --gradient_precision=fp32." ) - elif args.gradient_precision is None or args.gradient_precision == "fp32": + elif args.gradient_precision == "fp32": info_log( "Gradient accumulation steps are enabled, and gradient precision is set to 'fp32'." ) From d19797032e4ae257d37a61e045cf8f08aea0b2bd Mon Sep 17 00:00:00 2001 From: bghira Date: Wed, 24 Jul 2024 13:40:27 -0600 Subject: [PATCH 4/5] update model feature lists --- README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 81e6e8ae..52c65cdf 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,14 @@ - [Design Philosophy](#design-philosophy) - [Tutorial](#tutorial) - [Features](#features) -- [Hardware Requirements](#hardware-requirements) - - [SDXL](#sdxl) - - [Stable Diffusion 2.0/2.1](#stable-diffusion-2x) + - [PixArt Sigma](#pixart-sigma) + - [Stable Diffusion 2.0/2.1](#stable-diffusion-20--21) - [Stable Diffusion 3.0](#stable-diffusion-3) + - [AuraFlow](#auraflow) + - [Kwai Kolors](#kwai-kolors) +- [Hardware Requirements](#hardware-requirements) + - [SDXL](#sdxl-1024px) + - [Stable Diffusion (Legacy)](#stable-diffusion-2x-768px) - [Scripts](#scripts) - [Toolkit](#toolkit) - [Setup](#setup) @@ -59,7 +63,16 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP - Webhook support for updating eg. Discord channels with your training progress, validations, and errors - Integration with the [Hugging Face Hub](https://huggingface.co) for seamless model upload and nice automatically-generated model cards. -### Stable Diffusion 2.0/2.1 +### PixArt Sigma + +SimpleTuner has extensive training integration with PixArt Sigma - both the 600M & 900M models load without any fuss. + +- Text encoder training is not supported, as T5 is enormous. +- LoRA and full tuning both work as expected +- ControlNet training is not yet supported +- [Two-stage PixArt](https://huggingface.co/ptx0/pixart-900m-1024-ft-v0.7-stage1) training support (see: [MIXTURE_OF_EXPERTS](/documentation/MIXTURE_OF_EXPERTS.md)) + +### Stable Diffusion 2.0 & 2.1 Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn't have to be the case. Related features in SimpleTuner include: From e6334fa18d078d90822792fdd5adf47bd3eb3597 Mon Sep 17 00:00:00 2001 From: bghira Date: Wed, 24 Jul 2024 13:54:15 -0600 Subject: [PATCH 5/5] revise features in readme --- README.md | 56 +++++++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 52c65cdf..79acea77 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,6 @@ **SimpleTuner** is a repository dedicated to a set of experimental scripts designed for training optimization. The project is geared towards simplicity, with a focus on making the code easy to read and understand. This codebase serves as a shared academic exercise, and contributions are welcome. -- Multi-GPU training -- Aspect bucketing "just works"; fill a folder of images and let it rip -- Multiple datasets can be used in a single training session, each with a different base resolution. -- VRAM-saving techniques, such as pre-computing VAE and text encoder outputs -- Full featured fine-tuning support - - Bias training (BitFit) -- LoRA training support - ## Table of Contents - [Design Philosophy](#design-philosophy) @@ -25,6 +17,7 @@ - [Hardware Requirements](#hardware-requirements) - [SDXL](#sdxl-1024px) - [Stable Diffusion (Legacy)](#stable-diffusion-2x-768px) + - [AuraFlow v0.1](#auraflow-v01) - [Scripts](#scripts) - [Toolkit](#toolkit) - [Setup](#setup) @@ -48,21 +41,21 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP ## Features -- Precomputed VAE (latents) outputs saved to storage, eliminating the need to invoke the VAE during training. -- Precomputed captions are run through the text encoder(s) and saved to storage to save on VRAM. -- Trainable on a 24G GPU, or even down to 16G at lower base resolutions. - - LoRA training for SDXL, SD3, and SD 2.x that uses less than 16G VRAM. +- Multi-GPU training +- Image and caption features (embeds) are cached to the hard drive in advance, so that training runs faster and with less memory consumption +- Aspect bucketing: support for a variety of image sizes and aspect ratios, enabling widescreen and portrait training. +- Refiner LoRA or full u-net training for SDXL +- Most models are trainable on a 24G GPU, or even down to 16G at lower base resolutions. + - LoRA training for PixArt, SDXL, SD3, and SD 2.x that uses less than 16G VRAM; AuraFlow uses less than 24G VRAM - DeepSpeed integration allowing for [training SDXL's full u-net on 12G of VRAM](/documentation/DEEPSPEED.md), albeit very slowly. - Optional EMA (Exponential moving average) weight network to counteract model overfitting and improve training stability. **Note:** This does not apply to LoRA. -- Support for a variety of image sizes and aspect ratios, enabling widescreen and portrait training. - Train directly from an S3-compatible storage provider, eliminating the requirement for expensive local storage. (Tested with Cloudflare R2 and Wasabi S3) -- [DeepFloyd stage I and II full u-net or parameter-efficient fine-tuning](/documentation/DEEPFLOYD.md) via LoRA using 22G VRAM -- SDXL Refiner LoRA or full u-net training, incl validation using img2img -- Full [ControlNet model training](/documentation/CONTROLNET.md) (not ControlLoRA or ControlLite) +- For only SDXL and SD 1.x/2.x, full [ControlNet model training](/documentation/CONTROLNET.md) (not ControlLoRA or ControlLite) - Training [Mixture of Experts](/documentation/MIXTURE_OF_EXPERTS.md) for lightweight, high-quality diffusion models - Webhook support for updating eg. Discord channels with your training progress, validations, and errors - Integration with the [Hugging Face Hub](https://huggingface.co) for seamless model upload and nice automatically-generated model cards. + ### PixArt Sigma SimpleTuner has extensive training integration with PixArt Sigma - both the 600M & 900M models load without any fuss. @@ -72,6 +65,8 @@ SimpleTuner has extensive training integration with PixArt Sigma - both the 600M - ControlNet training is not yet supported - [Two-stage PixArt](https://huggingface.co/ptx0/pixart-900m-1024-ft-v0.7-stage1) training support (see: [MIXTURE_OF_EXPERTS](/documentation/MIXTURE_OF_EXPERTS.md)) +See the [PixArt Quickstart](/documentation/quickstart/SIGMA.md) guide to start training. + ### Stable Diffusion 2.0 & 2.1 Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn't have to be the case. Related features in SimpleTuner include: @@ -83,17 +78,12 @@ Stable Diffusion 2.1 is known for difficulty during fine-tuning, but this doesn' ### Stable Diffusion 3 -This model is very new and the current level of support for it in SimpleTuner is preliminary: - - LoRA and full finetuning are supported as usual. - ControlNet is not yet implemented. - Certain features such as segmented timestep selection and Compel long prompt weighting are not yet supported. +- Parameters have been optimised to get the best results, validated through from-scratch training of SD3 models -A few sharp edges could catch you off-guard, but for the most part, this initial pass at SD3 support is considered to be robust enough not to let you screw up too many parameters - it will oftentimes simply override bad values and set them for more sensible ones. - -Simply point your base model to a Stable Diffusion 3 checkpoint and set `STABLE_DIFFUSION_3=true` in your environment file. - -> ⚠️ In the current source release of Diffusers, gradient checkpointing is broken for Stable Diffusion 3 models. This will result in much, much higher memory use. +See the [Stable Diffusion 3 Quickstart](/documentation/quickstart/SD3.md) to get going. ### AuraFlow @@ -103,18 +93,6 @@ Currently, AuraFlow v0.1 has limited support for SimpleTuner: - All limitations that apply to Stable Diffusion 3 also apply to AuraFlow - LoRA is currently the only viable method of AuraFlow training -This model is very large, and will require more resources to train than PixArt or SDXL. - -AuraFlow has some distinct advantages that make it worth investigating over Stable Diffusion 3: - -- It is the largest open text-to-image model with a truly open license -- It uses the SDXL 4ch VAE which arguably provides an easier learning objective over the 16ch VAE from Stable Diffusion 3 - - Though small newspaper or book print text suffers at 4ch compression levels, the overall fine details makes this approach viable. -- It uses just a single text encoder versus Stable Diffusion's three text encoders - - AuraFlow leverages EleutherAI's **Pile-T5** which was trained on **twice as much data** with **fewer parameters** than Stable Diffusion 3, DeepFloyd, and PixArt's **T5-XXL v1.1** - - Pile-T5 has gone through less content prefiltering than OpenCLIP or T5 v1.1, and has "consumed more of the Internet" than T5 v1.1 - - With a large data corpus, it has potential for subtle semantic understanding of linguistic oddities, and understanding of more modern concepts without finetuning the text encoder - ### Kwai Kolors An SDXL-based model with ChatGLM (General Language Model) 6B as its text encoder, **doubling** the hidden dimension size and substantially increasing the level of local detail included in the prompt embeds. @@ -150,6 +128,14 @@ Without EMA, more care must be taken not to drastically change the model leading - NVIDIA RTX 4090 or better (24G, no EMA) - NVIDIA RTX 4080 or better (LoRA only) +### AuraFlow v0.1 + +This model is very large; it will require more resources to train than any other, incurring a substantial hardware cost. + +- Full tuning will OOM at a batch size of 1 on a single 80G GPU. A system with 8x A100-80G (SXM4) is a recommended minimum for FSDP (DeepSpeed ZeRO Stage 2) training. +- LoRA training will OOM at a batch size of 1 on a single 16G GPU. A system with 1x 24G is required, with a 48G GPU being an ideal size. + + ## Scripts - `ubuntu.sh` - This is a basic "installer" that makes it quick to deploy on a Vast.ai instance. It might not work for every single container image.