diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 690248a07e..5835a38126 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -30,6 +30,14 @@ jobs: dockerfile: cmd/training-operator.v2alpha1/Dockerfile platforms: linux/amd64,linux/arm64,linux/ppc64le tag-prefix: v2alpha1 + - component-name: model-initializer-v2 + dockerfile: cmd/initializer_v2/model/Dockerfile + platforms: linux/amd64,linux/arm64 + tag-prefix: v2 + - component-name: dataset-initializer-v2 + dockerfile: cmd/initializer_v2/dataset/Dockerfile + platforms: linux/amd64,linux/arm64 + tag-prefix: v2 - component-name: kubectl-delivery dockerfile: build/images/kubectl-delivery/Dockerfile platforms: linux/amd64,linux/arm64,linux/ppc64le diff --git a/.gitignore b/.gitignore index 2880c6d9e8..293514c8b3 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,8 @@ cover.out .vscode/ __debug_bin -# Compiled python files. -*.pyc +# Python cache files +__pycache__/ # Emacs temporary files *~ diff --git a/cmd/initializer_v2/dataset/Dockerfile b/cmd/initializer_v2/dataset/Dockerfile new file mode 100644 index 0000000000..5bd05b8ac8 --- /dev/null +++ b/cmd/initializer_v2/dataset/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-alpine + +WORKDIR /workspace + +# Copy the required Python modules. +COPY cmd/initializer_v2/dataset/requirements.txt . +COPY sdk/python/kubeflow sdk/python/kubeflow +COPY pkg/initializer_v2 pkg/initializer_v2 + +# Install the needed packages. 
+RUN pip install -r requirements.txt + +ENTRYPOINT ["python", "-m", "pkg.initializer_v2.dataset"] diff --git a/cmd/initializer_v2/dataset/requirements.txt b/cmd/initializer_v2/dataset/requirements.txt new file mode 100644 index 0000000000..6c9013ec2c --- /dev/null +++ b/cmd/initializer_v2/dataset/requirements.txt @@ -0,0 +1 @@ +huggingface_hub==0.23.4 diff --git a/cmd/initializer_v2/model/Dockerfile b/cmd/initializer_v2/model/Dockerfile new file mode 100644 index 0000000000..4b010f6ce9 --- /dev/null +++ b/cmd/initializer_v2/model/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-alpine + +WORKDIR /workspace + +# Copy the required Python modules. +COPY cmd/initializer_v2/model/requirements.txt . +COPY sdk/python/kubeflow sdk/python/kubeflow +COPY pkg/initializer_v2 pkg/initializer_v2 + +# Install the needed packages. +RUN pip install -r requirements.txt + +ENTRYPOINT ["python", "-m", "pkg.initializer_v2.model"] diff --git a/cmd/initializer_v2/model/requirements.txt b/cmd/initializer_v2/model/requirements.txt new file mode 100644 index 0000000000..6c9013ec2c --- /dev/null +++ b/cmd/initializer_v2/model/requirements.txt @@ -0,0 +1 @@ +huggingface_hub==0.23.4 diff --git a/pkg/initializer_v2/dataset/__main__.py b/pkg/initializer_v2/dataset/__main__.py new file mode 100644 index 0000000000..2be2dd9cb8 --- /dev/null +++ b/pkg/initializer_v2/dataset/__main__.py @@ -0,0 +1,31 @@ +import logging +import os +from urllib.parse import urlparse + +import pkg.initializer_v2.utils.utils as utils +from pkg.initializer_v2.dataset.huggingface import HuggingFace + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + +if __name__ == "__main__": + logging.info("Starting dataset initialization") + + try: + storage_uri = os.environ[utils.STORAGE_URI_ENV] + except Exception as e: + logging.error("STORAGE_URI env variable must be set.") + raise e + + match urlparse(storage_uri).scheme: + # 
TODO (andreyvelich): Implement more dataset providers. + case utils.HF_SCHEME: + hf = HuggingFace() + hf.load_config() + hf.download_dataset() + case _: + logging.error("STORAGE_URI must have the valid dataset provider") + raise Exception diff --git a/pkg/initializer_v2/dataset/config.py b/pkg/initializer_v2/dataset/config.py new file mode 100644 index 0000000000..e9af31e1ef --- /dev/null +++ b/pkg/initializer_v2/dataset/config.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from typing import Optional + + +# TODO (andreyvelich): This should be moved under Training V2 SDK. +@dataclass +class HuggingFaceDatasetConfig: + storage_uri: str + access_token: Optional[str] = None diff --git a/pkg/initializer_v2/dataset/huggingface.py b/pkg/initializer_v2/dataset/huggingface.py new file mode 100644 index 0000000000..b5a57bdd57 --- /dev/null +++ b/pkg/initializer_v2/dataset/huggingface.py @@ -0,0 +1,42 @@ +import logging +from urllib.parse import urlparse + +import huggingface_hub + +import pkg.initializer_v2.utils.utils as utils + +# TODO (andreyvelich): This should be moved to SDK V2 constants. 
+import sdk.python.kubeflow.storage_initializer.constants as constants +from pkg.initializer_v2.dataset.config import HuggingFaceDatasetConfig + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + + +class HuggingFace(utils.DatasetProvider): + + def load_config(self): + config_dict = utils.get_config_from_env(HuggingFaceDatasetConfig) + logging.info(f"Config for HuggingFace dataset initializer: {config_dict}") + self.config = HuggingFaceDatasetConfig(**config_dict) + + def download_dataset(self): + storage_uri_parsed = urlparse(self.config.storage_uri) + dataset_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + + logging.info(f"Downloading dataset: {dataset_uri}") + logging.info("-" * 40) + + if self.config.access_token: + huggingface_hub.login(self.config.access_token) + + huggingface_hub.snapshot_download( + repo_id=dataset_uri, + repo_type="dataset", + local_dir=constants.VOLUME_PATH_DATASET, + ) + + logging.info("Dataset has been downloaded") diff --git a/pkg/initializer_v2/model/__main__.py b/pkg/initializer_v2/model/__main__.py new file mode 100644 index 0000000000..eb3126385a --- /dev/null +++ b/pkg/initializer_v2/model/__main__.py @@ -0,0 +1,33 @@ +import logging +import os +from urllib.parse import urlparse + +import pkg.initializer_v2.utils.utils as utils +from pkg.initializer_v2.model.huggingface import HuggingFace + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + +if __name__ == "__main__": + logging.info("Starting pre-trained model initialization") + + try: + storage_uri = os.environ[utils.STORAGE_URI_ENV] + except Exception as e: + logging.error("STORAGE_URI env variable must be set.") + raise e + + match urlparse(storage_uri).scheme: + # TODO (andreyvelich): Implement more model providers. 
+ case utils.HF_SCHEME: + hf = HuggingFace() + hf.load_config() + hf.download_model() + case _: + logging.error( + f"STORAGE_URI must have the valid model provider. STORAGE_URI: {storage_uri}" + ) + raise Exception diff --git a/pkg/initializer_v2/model/config.py b/pkg/initializer_v2/model/config.py new file mode 100644 index 0000000000..c2ef7a6da4 --- /dev/null +++ b/pkg/initializer_v2/model/config.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from typing import Optional + + +# TODO (andreyvelich): This should be moved under Training V2 SDK. +@dataclass +class HuggingFaceModelInputConfig: + storage_uri: str + access_token: Optional[str] = None diff --git a/pkg/initializer_v2/model/huggingface.py b/pkg/initializer_v2/model/huggingface.py new file mode 100644 index 0000000000..46a6347e8f --- /dev/null +++ b/pkg/initializer_v2/model/huggingface.py @@ -0,0 +1,47 @@ +import logging +from urllib.parse import urlparse + +import huggingface_hub + +import pkg.initializer_v2.utils.utils as utils + +# TODO (andreyvelich): This should be moved to SDK V2 constants. 
+import sdk.python.kubeflow.storage_initializer.constants as constants +from pkg.initializer_v2.model.config import HuggingFaceModelInputConfig + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.INFO, +) + + +class HuggingFace(utils.ModelProvider): + + def load_config(self): + config_dict = utils.get_config_from_env(HuggingFaceModelInputConfig) + logging.info(f"Config for HuggingFace model initializer: {config_dict}") + self.config = HuggingFaceModelInputConfig(**config_dict) + + def download_model(self): + storage_uri_parsed = urlparse(self.config.storage_uri) + model_uri = storage_uri_parsed.netloc + storage_uri_parsed.path + + logging.info(f"Downloading model: {model_uri}") + logging.info("-" * 40) + + if self.config.access_token: + huggingface_hub.login(self.config.access_token) + + # TODO (andreyvelich): We should consider following the vLLM approach with allow patterns. + # Ref: https://github.com/kubeflow/training-operator/pull/2303#discussion_r1815913663 + # TODO (andreyvelich): We should update patterns for Mistral model + # Ref: https://github.com/kubeflow/training-operator/pull/2303#discussion_r1815914270 + huggingface_hub.snapshot_download( + repo_id=model_uri, + local_dir=constants.VOLUME_PATH_MODEL, + allow_patterns=["*.json", "*.safetensors", "*.model"], + ignore_patterns=["*.msgpack", "*.h5", "*.bin", "*.pt", "*.pth"], + ) + + logging.info("Model has been downloaded") diff --git a/pkg/initializer_v2/utils/__init__.py b/pkg/initializer_v2/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pkg/initializer_v2/utils/utils.py b/pkg/initializer_v2/utils/utils.py new file mode 100644 index 0000000000..aef7262f78 --- /dev/null +++ b/pkg/initializer_v2/utils/utils.py @@ -0,0 +1,37 @@ +import os +from abc import ABC, abstractmethod +from dataclasses import fields +from typing import Dict + +STORAGE_URI_ENV = "STORAGE_URI" +HF_SCHEME = 
"hf" + + +class ModelProvider(ABC): + @abstractmethod + def load_config(self): + raise NotImplementedError() + + @abstractmethod + def download_model(self): + raise NotImplementedError() + + +class DatasetProvider(ABC): + @abstractmethod + def load_config(self): + raise NotImplementedError() + + @abstractmethod + def download_dataset(self): + raise NotImplementedError() + + +# Get DataClass config from the environment variables. +# Env names must be equal to the DataClass parameters. +def get_config_from_env(config) -> Dict[str, str]: + config_from_env = {} + for field in fields(config): + config_from_env[field.name] = os.getenv(field.name.upper()) + + return config_from_env