From fe481b74a670718a6c927e27d4e0d2a74f732687 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Wed, 23 Oct 2024 22:15:37 +0100 Subject: [PATCH 1/6] KEP-2170: Create model and dataset initializers Signed-off-by: Andrey Velichkevich --- .github/workflows/publish-core-images.yaml | 8 ++++ .gitignore | 4 +- cmd/initiailizer_v2/dataset/Dockerfile | 13 ++++++ cmd/initiailizer_v2/dataset/requirements.txt | 1 + cmd/initiailizer_v2/model/Dockerfile | 13 ++++++ cmd/initiailizer_v2/model/requirements.txt | 1 + pkg/initiailizer_v2/dataset/__main__.py | 35 ++++++++++++++++ pkg/initiailizer_v2/dataset/config.py | 8 ++++ pkg/initiailizer_v2/dataset/huggingface.py | 40 +++++++++++++++++++ pkg/initiailizer_v2/model/__main__.py | 37 +++++++++++++++++ pkg/initiailizer_v2/model/config.py | 9 +++++ pkg/initiailizer_v2/model/huggingface.py | 42 ++++++++++++++++++++ pkg/initiailizer_v2/utils/__init__.py | 0 pkg/initiailizer_v2/utils/utils.py | 16 ++++++++ 14 files changed, 225 insertions(+), 2 deletions(-) create mode 100644 cmd/initiailizer_v2/dataset/Dockerfile create mode 100644 cmd/initiailizer_v2/dataset/requirements.txt create mode 100644 cmd/initiailizer_v2/model/Dockerfile create mode 100644 cmd/initiailizer_v2/model/requirements.txt create mode 100644 pkg/initiailizer_v2/dataset/__main__.py create mode 100644 pkg/initiailizer_v2/dataset/config.py create mode 100644 pkg/initiailizer_v2/dataset/huggingface.py create mode 100644 pkg/initiailizer_v2/model/__main__.py create mode 100644 pkg/initiailizer_v2/model/config.py create mode 100644 pkg/initiailizer_v2/model/huggingface.py create mode 100644 pkg/initiailizer_v2/utils/__init__.py create mode 100644 pkg/initiailizer_v2/utils/utils.py diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 690248a07e..22426b94ec 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -30,6 +30,14 @@ jobs: dockerfile: 
cmd/training-operator.v2alpha1/Dockerfile platforms: linux/amd64,linux/arm64,linux/ppc64le tag-prefix: v2alpha1 + - component-name: model-initiailizer-v2 + dockerfile: cmd/initiailizer_v2/model/Dockerfile + platforms: linux/amd64,linux/arm64 + tag-prefix: v2 + - component-name: dataset-initiailizer-v2 + dockerfile: cmd/initiailizer_v2/dataset/Dockerfile + platforms: linux/amd64,linux/arm64 + tag-prefix: v2 - component-name: kubectl-delivery dockerfile: build/images/kubectl-delivery/Dockerfile platforms: linux/amd64,linux/arm64,linux/ppc64le diff --git a/.gitignore b/.gitignore index 2880c6d9e8..30b973c743 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,8 @@ cover.out .vscode/ __debug_bin -# Compiled python files. -*.pyc +# Python chache files +__pycache__/ # Emacs temporary files *~ diff --git a/cmd/initiailizer_v2/dataset/Dockerfile b/cmd/initiailizer_v2/dataset/Dockerfile new file mode 100644 index 0000000000..06b6b11076 --- /dev/null +++ b/cmd/initiailizer_v2/dataset/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-alpine + +WORKDIR /workspace + +# Copy the required Python modules. +COPY cmd/initiailizer_v2/dataset/requirements.txt . +COPY sdk/python/kubeflow sdk/python/kubeflow +COPY pkg/initiailizer_v2 pkg/initiailizer_v2 + +# Install the needed packages. +RUN pip install -r requirements.txt + +ENTRYPOINT ["python", "-m", "pkg.initiailizer_v2.dataset"] diff --git a/cmd/initiailizer_v2/dataset/requirements.txt b/cmd/initiailizer_v2/dataset/requirements.txt new file mode 100644 index 0000000000..6c9013ec2c --- /dev/null +++ b/cmd/initiailizer_v2/dataset/requirements.txt @@ -0,0 +1 @@ +huggingface_hub==0.23.4 diff --git a/cmd/initiailizer_v2/model/Dockerfile b/cmd/initiailizer_v2/model/Dockerfile new file mode 100644 index 0000000000..2960d58cb2 --- /dev/null +++ b/cmd/initiailizer_v2/model/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-alpine + +WORKDIR /workspace + +# Copy the required Python modules. +COPY cmd/initiailizer_v2/model/requirements.txt . 
+COPY sdk/python/kubeflow sdk/python/kubeflow +COPY pkg/initiailizer_v2 pkg/initiailizer_v2 + +# Install the needed packages. +RUN pip install -r requirements.txt + +ENTRYPOINT ["python", "-m", "pkg.initiailizer_v2.model"] diff --git a/cmd/initiailizer_v2/model/requirements.txt b/cmd/initiailizer_v2/model/requirements.txt new file mode 100644 index 0000000000..6c9013ec2c --- /dev/null +++ b/cmd/initiailizer_v2/model/requirements.txt @@ -0,0 +1 @@ +huggingface_hub==0.23.4 diff --git a/pkg/initiailizer_v2/dataset/__main__.py b/pkg/initiailizer_v2/dataset/__main__.py new file mode 100644 index 0000000000..bea7966e75 --- /dev/null +++ b/pkg/initiailizer_v2/dataset/__main__.py @@ -0,0 +1,35 @@ +import logging +import os +from urllib.parse import urlparse + +import pkg.initiailizer_v2.utils.utils as utils +from pkg.initiailizer_v2.dataset.huggingface import HuggingFace + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + +if __name__ == "__main__": + logging.info("Starting dataset initialization") + + try: + storage_uri = os.environ[utils.STORAGE_URI_ENV] + except Exception as e: + logging.error("STORAGE_URI env variable must be set.") + raise e + + logging.info(f"Storage URI: {storage_uri}") + + storage_uri_parsed = urlparse(storage_uri) + + match storage_uri_parsed.scheme: + # TODO (andreyvelich): Implement more dataset providers. 
+ case utils.HF_SCHEME: + hf = HuggingFace() + hf.load_config() + hf.download_dataset(storage_uri_parsed) + case _: + logging.error("STORAGE_URI must have the valid dataset provider") + raise Exception diff --git a/pkg/initiailizer_v2/dataset/config.py b/pkg/initiailizer_v2/dataset/config.py new file mode 100644 index 0000000000..210b3e43ab --- /dev/null +++ b/pkg/initiailizer_v2/dataset/config.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from typing import Optional + + +# TODO (andreyvelich): This should be moved under Training V2 SDK. +@dataclass +class HuggingFaceDatasetConfig: + access_token: Optional[str] = None diff --git a/pkg/initiailizer_v2/dataset/huggingface.py b/pkg/initiailizer_v2/dataset/huggingface.py new file mode 100644 index 0000000000..ca4c1817d9 --- /dev/null +++ b/pkg/initiailizer_v2/dataset/huggingface.py @@ -0,0 +1,40 @@ +import logging +from urllib.parse import ParseResult + +import huggingface_hub + +import pkg.initiailizer_v2.utils.utils as utils + +# TODO (andreyvelich): This should be moved to SDK V2 constants. 
+import sdk.python.kubeflow.storage_initializer.constants as constants +from pkg.initiailizer_v2.dataset.config import HuggingFaceDatasetConfig + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + + +class HuggingFace: + + def load_config(self): + config_dict = utils.get_config_from_env(HuggingFaceDatasetConfig) + logging.info(f"Config for HuggingFace dataset initiailizer: {config_dict}") + self.config = HuggingFaceDatasetConfig(**config_dict) + + def download_dataset(self, storage_uri_parsed: ParseResult): + dataset_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + logging.info(f"Downloading dataset: {dataset_uri}") + logging.info("-" * 40) + + if self.config.access_token: + huggingface_hub.login(self.config.access_token) + + huggingface_hub.snapshot_download( + repo_id=dataset_uri, + repo_type="dataset", + local_dir=constants.VOLUME_PATH_DATASET, + ) + + logging.info("Dataset has been downloaded") diff --git a/pkg/initiailizer_v2/model/__main__.py b/pkg/initiailizer_v2/model/__main__.py new file mode 100644 index 0000000000..85621a8cf6 --- /dev/null +++ b/pkg/initiailizer_v2/model/__main__.py @@ -0,0 +1,37 @@ +import logging +import os +from urllib.parse import urlparse + +import pkg.initiailizer_v2.utils.utils as utils +from pkg.initiailizer_v2.model.huggingface import HuggingFace + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + +if __name__ == "__main__": + logging.info("Starting pre-trained model initialization") + + try: + storage_uri = os.environ[utils.STORAGE_URI_ENV] + except Exception as e: + logging.error("STORAGE_URI env variable must be set.") + raise e + + logging.info(f"Storage URI: {storage_uri}") + + storage_uri_parsed = urlparse(storage_uri) + + match storage_uri_parsed.scheme: + # TODO (andreyvelich): Implement more 
model providers. + case utils.HF_SCHEME: + hf = HuggingFace() + hf.load_config() + hf.download_model(storage_uri_parsed) + case _: + logging.error( + f"STORAGE_URI must have the valid model provider. STORAGE_URI: {storage_uri}" + ) + raise Exception diff --git a/pkg/initiailizer_v2/model/config.py b/pkg/initiailizer_v2/model/config.py new file mode 100644 index 0000000000..827a08c456 --- /dev/null +++ b/pkg/initiailizer_v2/model/config.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from typing import Optional + + +# TODO (andreyvelich): This should be moved under Training V2 SDK. +@dataclass +class HuggingFaceModelInputConfig: + invalid: str + access_token: Optional[str] = None diff --git a/pkg/initiailizer_v2/model/huggingface.py b/pkg/initiailizer_v2/model/huggingface.py new file mode 100644 index 0000000000..df49b16d46 --- /dev/null +++ b/pkg/initiailizer_v2/model/huggingface.py @@ -0,0 +1,42 @@ +import logging +from urllib.parse import ParseResult + +import huggingface_hub + +import pkg.initiailizer_v2.utils.utils as utils + +# TODO (andreyvelich): This should be moved to SDK V2 constants. 
+import sdk.python.kubeflow.storage_initializer.constants as constants +from pkg.initiailizer_v2.model.config import HuggingFaceModelInputConfig + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + + +class HuggingFace: + + def load_config(self): + config_dict = utils.get_config_from_env(HuggingFaceModelInputConfig) + logging.info(f"Config for HuggingFace model initiailizer: {config_dict}") + self.config = HuggingFaceModelInputConfig(**config_dict) + + def download_model(self, storage_uri_parsed: ParseResult): + model_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + logging.info(f"Downloading model: {model_uri}") + logging.info("-" * 40) + + if self.config.access_token: + huggingface_hub.login(self.config.access_token) + + # TODO (andreyvelich): We should verify these patterns for different models. + huggingface_hub.snapshot_download( + repo_id=model_uri, + local_dir=constants.VOLUME_PATH_MODEL, + allow_patterns=["*.json", "*.safetensors", "*.model"], + ignore_patterns=["*.msgpack", "*.h5", "*.bin"], + ) + + logging.info("Model has been downloaded") diff --git a/pkg/initiailizer_v2/utils/__init__.py b/pkg/initiailizer_v2/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pkg/initiailizer_v2/utils/utils.py b/pkg/initiailizer_v2/utils/utils.py new file mode 100644 index 0000000000..add79768ce --- /dev/null +++ b/pkg/initiailizer_v2/utils/utils.py @@ -0,0 +1,16 @@ +import os +from dataclasses import fields +from typing import Dict + +STORAGE_URI_ENV = "STORAGE_URI" +HF_SCHEME = "hf" + + +# Get DataClass config from the environment variables. +# Env names must be equal to the DataClass parameters. 
+def get_config_from_env(config) -> Dict[str, str]: + config_from_env = {} + for field in fields(config): + config_from_env[field.name] = os.getenv(field.name.upper()) + + return config_from_env From 59d32243ca2489a099c6f0cab2053f468c0e0afe Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Wed, 23 Oct 2024 22:28:10 +0100 Subject: [PATCH 2/6] Add abstract classes Signed-off-by: Andrey Velichkevich --- pkg/initiailizer_v2/dataset/huggingface.py | 2 +- pkg/initiailizer_v2/model/huggingface.py | 2 +- pkg/initiailizer_v2/utils/utils.py | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pkg/initiailizer_v2/dataset/huggingface.py b/pkg/initiailizer_v2/dataset/huggingface.py index ca4c1817d9..deb86bb2f8 100644 --- a/pkg/initiailizer_v2/dataset/huggingface.py +++ b/pkg/initiailizer_v2/dataset/huggingface.py @@ -16,7 +16,7 @@ ) -class HuggingFace: +class HuggingFace(utils.DatasetProvider): def load_config(self): config_dict = utils.get_config_from_env(HuggingFaceDatasetConfig) diff --git a/pkg/initiailizer_v2/model/huggingface.py b/pkg/initiailizer_v2/model/huggingface.py index df49b16d46..d6d13ae3f4 100644 --- a/pkg/initiailizer_v2/model/huggingface.py +++ b/pkg/initiailizer_v2/model/huggingface.py @@ -16,7 +16,7 @@ ) -class HuggingFace: +class HuggingFace(utils.ModelProvider): def load_config(self): config_dict = utils.get_config_from_env(HuggingFaceModelInputConfig) diff --git a/pkg/initiailizer_v2/utils/utils.py b/pkg/initiailizer_v2/utils/utils.py index add79768ce..aef7262f78 100644 --- a/pkg/initiailizer_v2/utils/utils.py +++ b/pkg/initiailizer_v2/utils/utils.py @@ -1,4 +1,5 @@ import os +from abc import ABC, abstractmethod from dataclasses import fields from typing import Dict @@ -6,6 +7,26 @@ HF_SCHEME = "hf" +class ModelProvider(ABC): + @abstractmethod + def load_config(self): + raise NotImplementedError() + + @abstractmethod + def download_model(self): + raise NotImplementedError() + + +class DatasetProvider(ABC): + 
@abstractmethod + def load_config(self): + raise NotImplementedError() + + @abstractmethod + def download_dataset(self): + raise NotImplementedError() + + # Get DataClass config from the environment variables. # Env names must be equal to the DataClass parameters. def get_config_from_env(config) -> Dict[str, str]: From 45c860c082d2f3ac3a5906980e1c3338f0ee058d Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Wed, 23 Oct 2024 22:44:06 +0100 Subject: [PATCH 3/6] Add storage URI to config Signed-off-by: Andrey Velichkevich --- pkg/initiailizer_v2/dataset/__main__.py | 8 ++------ pkg/initiailizer_v2/dataset/config.py | 1 + pkg/initiailizer_v2/dataset/huggingface.py | 6 ++++-- pkg/initiailizer_v2/model/__main__.py | 8 ++------ pkg/initiailizer_v2/model/config.py | 2 +- pkg/initiailizer_v2/model/huggingface.py | 6 ++++-- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/pkg/initiailizer_v2/dataset/__main__.py b/pkg/initiailizer_v2/dataset/__main__.py index bea7966e75..8b7c9fb766 100644 --- a/pkg/initiailizer_v2/dataset/__main__.py +++ b/pkg/initiailizer_v2/dataset/__main__.py @@ -20,16 +20,12 @@ logging.error("STORAGE_URI env variable must be set.") raise e - logging.info(f"Storage URI: {storage_uri}") - - storage_uri_parsed = urlparse(storage_uri) - - match storage_uri_parsed.scheme: + match urlparse(storage_uri).scheme: # TODO (andreyvelich): Implement more dataset providers. case utils.HF_SCHEME: hf = HuggingFace() hf.load_config() - hf.download_dataset(storage_uri_parsed) + hf.download_dataset() case _: logging.error("STORAGE_URI must have the valid dataset provider") raise Exception diff --git a/pkg/initiailizer_v2/dataset/config.py b/pkg/initiailizer_v2/dataset/config.py index 210b3e43ab..e9af31e1ef 100644 --- a/pkg/initiailizer_v2/dataset/config.py +++ b/pkg/initiailizer_v2/dataset/config.py @@ -5,4 +5,5 @@ # TODO (andreyvelich): This should be moved under Training V2 SDK. 
@dataclass class HuggingFaceDatasetConfig: + storage_uri: str access_token: Optional[str] = None diff --git a/pkg/initiailizer_v2/dataset/huggingface.py b/pkg/initiailizer_v2/dataset/huggingface.py index deb86bb2f8..d4f1280c69 100644 --- a/pkg/initiailizer_v2/dataset/huggingface.py +++ b/pkg/initiailizer_v2/dataset/huggingface.py @@ -1,5 +1,5 @@ import logging -from urllib.parse import ParseResult +from urllib.parse import urlparse import huggingface_hub @@ -23,8 +23,10 @@ def load_config(self): logging.info(f"Config for HuggingFace dataset initiailizer: {config_dict}") self.config = HuggingFaceDatasetConfig(**config_dict) - def download_dataset(self, storage_uri_parsed: ParseResult): + def download_dataset(self): + storage_uri_parsed = urlparse(self.config.storage_uri) dataset_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + logging.info(f"Downloading dataset: {dataset_uri}") logging.info("-" * 40) diff --git a/pkg/initiailizer_v2/model/__main__.py b/pkg/initiailizer_v2/model/__main__.py index 85621a8cf6..680311cdc6 100644 --- a/pkg/initiailizer_v2/model/__main__.py +++ b/pkg/initiailizer_v2/model/__main__.py @@ -20,16 +20,12 @@ logging.error("STORAGE_URI env variable must be set.") raise e - logging.info(f"Storage URI: {storage_uri}") - - storage_uri_parsed = urlparse(storage_uri) - - match storage_uri_parsed.scheme: + match urlparse(storage_uri).scheme: # TODO (andreyvelich): Implement more model providers. case utils.HF_SCHEME: hf = HuggingFace() hf.load_config() - hf.download_model(storage_uri_parsed) + hf.download_model() case _: logging.error( f"STORAGE_URI must have the valid model provider. STORAGE_URI: {storage_uri}" diff --git a/pkg/initiailizer_v2/model/config.py b/pkg/initiailizer_v2/model/config.py index 827a08c456..c2ef7a6da4 100644 --- a/pkg/initiailizer_v2/model/config.py +++ b/pkg/initiailizer_v2/model/config.py @@ -5,5 +5,5 @@ # TODO (andreyvelich): This should be moved under Training V2 SDK. 
@dataclass class HuggingFaceModelInputConfig: - invalid: str + storage_uri: str access_token: Optional[str] = None diff --git a/pkg/initiailizer_v2/model/huggingface.py b/pkg/initiailizer_v2/model/huggingface.py index d6d13ae3f4..70f1cf251c 100644 --- a/pkg/initiailizer_v2/model/huggingface.py +++ b/pkg/initiailizer_v2/model/huggingface.py @@ -1,5 +1,5 @@ import logging -from urllib.parse import ParseResult +from urllib.parse import urlparse import huggingface_hub @@ -23,8 +23,10 @@ def load_config(self): logging.info(f"Config for HuggingFace model initiailizer: {config_dict}") self.config = HuggingFaceModelInputConfig(**config_dict) - def download_model(self, storage_uri_parsed: ParseResult): + def download_model(self): + storage_uri_parsed = urlparse(self.config.storage_uri) model_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + logging.info(f"Downloading model: {model_uri}") logging.info("-" * 40) From 468500d1286ee8d0d238591d53a57255e30ff152 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Wed, 23 Oct 2024 23:39:31 +0100 Subject: [PATCH 4/6] Update .gitignore Co-authored-by: Kevin Hannon Signed-off-by: Andrey Velichkevich --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 30b973c743..293514c8b3 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,7 @@ cover.out .vscode/ __debug_bin -# Python chache files +# Python cache files __pycache__/ # Emacs temporary files From 5c398124f19c1a39cbfe894e514247cf8168b265 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 25 Oct 2024 03:03:43 +0100 Subject: [PATCH 5/6] Fix the misspelling for initializer Signed-off-by: Andrey Velichkevich --- .github/workflows/publish-core-images.yaml | 8 ++++---- .../model => initializer_v2/dataset}/Dockerfile | 6 +++--- .../dataset/requirements.txt | 0 .../dataset => initializer_v2/model}/Dockerfile | 6 +++--- .../model/requirements.txt | 0 .../dataset/__main__.py | 4 ++-- pkg/{initiailizer_v2 => 
initializer_v2}/dataset/config.py | 0 .../dataset/huggingface.py | 6 +++--- pkg/{initiailizer_v2 => initializer_v2}/model/__main__.py | 4 ++-- pkg/{initiailizer_v2 => initializer_v2}/model/config.py | 0 .../model/huggingface.py | 6 +++--- pkg/{initiailizer_v2 => initializer_v2}/utils/__init__.py | 0 pkg/{initiailizer_v2 => initializer_v2}/utils/utils.py | 0 13 files changed, 20 insertions(+), 20 deletions(-) rename cmd/{initiailizer_v2/model => initializer_v2/dataset}/Dockerfile (56%) rename cmd/{initiailizer_v2 => initializer_v2}/dataset/requirements.txt (100%) rename cmd/{initiailizer_v2/dataset => initializer_v2/model}/Dockerfile (55%) rename cmd/{initiailizer_v2 => initializer_v2}/model/requirements.txt (100%) rename pkg/{initiailizer_v2 => initializer_v2}/dataset/__main__.py (88%) rename pkg/{initiailizer_v2 => initializer_v2}/dataset/config.py (100%) rename pkg/{initiailizer_v2 => initializer_v2}/dataset/huggingface.py (85%) rename pkg/{initiailizer_v2 => initializer_v2}/model/__main__.py (88%) rename pkg/{initiailizer_v2 => initializer_v2}/model/config.py (100%) rename pkg/{initiailizer_v2 => initializer_v2}/model/huggingface.py (86%) rename pkg/{initiailizer_v2 => initializer_v2}/utils/__init__.py (100%) rename pkg/{initiailizer_v2 => initializer_v2}/utils/utils.py (100%) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 22426b94ec..5835a38126 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -30,12 +30,12 @@ jobs: dockerfile: cmd/training-operator.v2alpha1/Dockerfile platforms: linux/amd64,linux/arm64,linux/ppc64le tag-prefix: v2alpha1 - - component-name: model-initiailizer-v2 - dockerfile: cmd/initiailizer_v2/model/Dockerfile + - component-name: model-initializer-v2 + dockerfile: cmd/initializer_v2/model/Dockerfile platforms: linux/amd64,linux/arm64 tag-prefix: v2 - - component-name: dataset-initiailizer-v2 - dockerfile: 
cmd/initiailizer_v2/dataset/Dockerfile + - component-name: dataset-initializer-v2 + dockerfile: cmd/initializer_v2/dataset/Dockerfile platforms: linux/amd64,linux/arm64 tag-prefix: v2 - component-name: kubectl-delivery diff --git a/cmd/initiailizer_v2/model/Dockerfile b/cmd/initializer_v2/dataset/Dockerfile similarity index 56% rename from cmd/initiailizer_v2/model/Dockerfile rename to cmd/initializer_v2/dataset/Dockerfile index 2960d58cb2..5bd05b8ac8 100644 --- a/cmd/initiailizer_v2/model/Dockerfile +++ b/cmd/initializer_v2/dataset/Dockerfile @@ -3,11 +3,11 @@ FROM python:3.11-alpine WORKDIR /workspace # Copy the required Python modules. -COPY cmd/initiailizer_v2/model/requirements.txt . +COPY cmd/initializer_v2/dataset/requirements.txt . COPY sdk/python/kubeflow sdk/python/kubeflow -COPY pkg/initiailizer_v2 pkg/initiailizer_v2 +COPY pkg/initializer_v2 pkg/initializer_v2 # Install the needed packages. RUN pip install -r requirements.txt -ENTRYPOINT ["python", "-m", "pkg.initiailizer_v2.model"] +ENTRYPOINT ["python", "-m", "pkg.initializer_v2.dataset"] diff --git a/cmd/initiailizer_v2/dataset/requirements.txt b/cmd/initializer_v2/dataset/requirements.txt similarity index 100% rename from cmd/initiailizer_v2/dataset/requirements.txt rename to cmd/initializer_v2/dataset/requirements.txt diff --git a/cmd/initiailizer_v2/dataset/Dockerfile b/cmd/initializer_v2/model/Dockerfile similarity index 55% rename from cmd/initiailizer_v2/dataset/Dockerfile rename to cmd/initializer_v2/model/Dockerfile index 06b6b11076..4b010f6ce9 100644 --- a/cmd/initiailizer_v2/dataset/Dockerfile +++ b/cmd/initializer_v2/model/Dockerfile @@ -3,11 +3,11 @@ FROM python:3.11-alpine WORKDIR /workspace # Copy the required Python modules. -COPY cmd/initiailizer_v2/dataset/requirements.txt . +COPY cmd/initializer_v2/model/requirements.txt . 
COPY sdk/python/kubeflow sdk/python/kubeflow -COPY pkg/initiailizer_v2 pkg/initiailizer_v2 +COPY pkg/initializer_v2 pkg/initializer_v2 # Install the needed packages. RUN pip install -r requirements.txt -ENTRYPOINT ["python", "-m", "pkg.initiailizer_v2.dataset"] +ENTRYPOINT ["python", "-m", "pkg.initializer_v2.model"] diff --git a/cmd/initiailizer_v2/model/requirements.txt b/cmd/initializer_v2/model/requirements.txt similarity index 100% rename from cmd/initiailizer_v2/model/requirements.txt rename to cmd/initializer_v2/model/requirements.txt diff --git a/pkg/initiailizer_v2/dataset/__main__.py b/pkg/initializer_v2/dataset/__main__.py similarity index 88% rename from pkg/initiailizer_v2/dataset/__main__.py rename to pkg/initializer_v2/dataset/__main__.py index 8b7c9fb766..2be2dd9cb8 100644 --- a/pkg/initiailizer_v2/dataset/__main__.py +++ b/pkg/initializer_v2/dataset/__main__.py @@ -2,8 +2,8 @@ import os from urllib.parse import urlparse -import pkg.initiailizer_v2.utils.utils as utils -from pkg.initiailizer_v2.dataset.huggingface import HuggingFace +import pkg.initializer_v2.utils.utils as utils +from pkg.initializer_v2.dataset.huggingface import HuggingFace logging.basicConfig( format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", diff --git a/pkg/initiailizer_v2/dataset/config.py b/pkg/initializer_v2/dataset/config.py similarity index 100% rename from pkg/initiailizer_v2/dataset/config.py rename to pkg/initializer_v2/dataset/config.py diff --git a/pkg/initiailizer_v2/dataset/huggingface.py b/pkg/initializer_v2/dataset/huggingface.py similarity index 85% rename from pkg/initiailizer_v2/dataset/huggingface.py rename to pkg/initializer_v2/dataset/huggingface.py index d4f1280c69..b5a57bdd57 100644 --- a/pkg/initiailizer_v2/dataset/huggingface.py +++ b/pkg/initializer_v2/dataset/huggingface.py @@ -3,11 +3,11 @@ import huggingface_hub -import pkg.initiailizer_v2.utils.utils as utils +import pkg.initializer_v2.utils.utils as utils # TODO 
(andreyvelich): This should be moved to SDK V2 constants. import sdk.python.kubeflow.storage_initializer.constants as constants -from pkg.initiailizer_v2.dataset.config import HuggingFaceDatasetConfig +from pkg.initializer_v2.dataset.config import HuggingFaceDatasetConfig logging.basicConfig( format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", @@ -20,7 +20,7 @@ class HuggingFace(utils.DatasetProvider): def load_config(self): config_dict = utils.get_config_from_env(HuggingFaceDatasetConfig) - logging.info(f"Config for HuggingFace dataset initiailizer: {config_dict}") + logging.info(f"Config for HuggingFace dataset initializer: {config_dict}") self.config = HuggingFaceDatasetConfig(**config_dict) def download_dataset(self): diff --git a/pkg/initiailizer_v2/model/__main__.py b/pkg/initializer_v2/model/__main__.py similarity index 88% rename from pkg/initiailizer_v2/model/__main__.py rename to pkg/initializer_v2/model/__main__.py index 680311cdc6..eb3126385a 100644 --- a/pkg/initiailizer_v2/model/__main__.py +++ b/pkg/initializer_v2/model/__main__.py @@ -2,8 +2,8 @@ import os from urllib.parse import urlparse -import pkg.initiailizer_v2.utils.utils as utils -from pkg.initiailizer_v2.model.huggingface import HuggingFace +import pkg.initializer_v2.utils.utils as utils +from pkg.initializer_v2.model.huggingface import HuggingFace logging.basicConfig( format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", diff --git a/pkg/initiailizer_v2/model/config.py b/pkg/initializer_v2/model/config.py similarity index 100% rename from pkg/initiailizer_v2/model/config.py rename to pkg/initializer_v2/model/config.py diff --git a/pkg/initiailizer_v2/model/huggingface.py b/pkg/initializer_v2/model/huggingface.py similarity index 86% rename from pkg/initiailizer_v2/model/huggingface.py rename to pkg/initializer_v2/model/huggingface.py index 70f1cf251c..fae4b45c43 100644 --- a/pkg/initiailizer_v2/model/huggingface.py +++ 
b/pkg/initializer_v2/model/huggingface.py @@ -3,11 +3,11 @@ import huggingface_hub -import pkg.initiailizer_v2.utils.utils as utils +import pkg.initializer_v2.utils.utils as utils # TODO (andreyvelich): This should be moved to SDK V2 constants. import sdk.python.kubeflow.storage_initializer.constants as constants -from pkg.initiailizer_v2.model.config import HuggingFaceModelInputConfig +from pkg.initializer_v2.model.config import HuggingFaceModelInputConfig logging.basicConfig( format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", @@ -20,7 +20,7 @@ class HuggingFace(utils.ModelProvider): def load_config(self): config_dict = utils.get_config_from_env(HuggingFaceModelInputConfig) - logging.info(f"Config for HuggingFace model initiailizer: {config_dict}") + logging.info(f"Config for HuggingFace model initializer: {config_dict}") self.config = HuggingFaceModelInputConfig(**config_dict) def download_model(self): diff --git a/pkg/initiailizer_v2/utils/__init__.py b/pkg/initializer_v2/utils/__init__.py similarity index 100% rename from pkg/initiailizer_v2/utils/__init__.py rename to pkg/initializer_v2/utils/__init__.py diff --git a/pkg/initiailizer_v2/utils/utils.py b/pkg/initializer_v2/utils/utils.py similarity index 100% rename from pkg/initiailizer_v2/utils/utils.py rename to pkg/initializer_v2/utils/utils.py From 2e8a518dab179867e1803d96511373bca875072c Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 25 Oct 2024 12:58:51 +0100 Subject: [PATCH 6/6] Add .pt and .pth to ignore_patterns Signed-off-by: Andrey Velichkevich --- pkg/initializer_v2/model/huggingface.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/initializer_v2/model/huggingface.py b/pkg/initializer_v2/model/huggingface.py index fae4b45c43..46a6347e8f 100644 --- a/pkg/initializer_v2/model/huggingface.py +++ b/pkg/initializer_v2/model/huggingface.py @@ -33,12 +33,15 @@ def download_model(self): if self.config.access_token: 
huggingface_hub.login(self.config.access_token) - # TODO (andreyvelich): We should verify these patterns for different models. + # TODO (andreyvelich): We should consider following the vLLM approach with allow patterns. + # Ref: https://github.com/kubeflow/training-operator/pull/2303#discussion_r1815913663 + # TODO (andreyvelich): We should update patterns for Mistral model + # Ref: https://github.com/kubeflow/training-operator/pull/2303#discussion_r1815914270 huggingface_hub.snapshot_download( repo_id=model_uri, local_dir=constants.VOLUME_PATH_MODEL, allow_patterns=["*.json", "*.safetensors", "*.model"], - ignore_patterns=["*.msgpack", "*.h5", "*.bin"], + ignore_patterns=["*.msgpack", "*.h5", "*.bin", ".pt", ".pth"], ) logging.info("Model has been downloaded")