From 2a882d7004ce5d0f47d67de4033c89d7dbf28900 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 20 Jul 2024 23:18:41 -0700 Subject: [PATCH 01/53] update tune api for llm hyperparameters optimization Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 381 ++++++++++++++---- .../kubeflow/katib/constants/constants.py | 25 ++ .../v1beta1/kubeflow/katib/utils/utils.py | 43 ++ .../v1beta1/kubeflow/trainer/Dockerfile | 17 + .../kubeflow/trainer/hf_llm_optimization.py | 196 +++++++++ .../v1beta1/kubeflow/trainer/requirements.txt | 4 + sdk/python/v1beta1/setup.py | 3 + 7 files changed, 582 insertions(+), 87 deletions(-) create mode 100644 sdk/python/v1beta1/kubeflow/trainer/Dockerfile create mode 100644 sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py create mode 100644 sdk/python/v1beta1/kubeflow/trainer/requirements.txt diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 8be9e52f6da..e7d6d49bcda 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -17,6 +17,9 @@ import textwrap import time from typing import Any, Callable, Dict, List, Optional, Union +import json +import logging +logger = logging.getLogger(__name__) import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -59,6 +62,7 @@ def __init__( k8s_client = client.ApiClient(client_configuration) self.custom_api = client.CustomObjectsApi(k8s_client) + self.core_api = client.CoreV1Api(k8s_client) self.api_client = ApiClient() self.namespace = namespace @@ -153,9 +157,16 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - objective: Callable, - parameters: Dict[str, Any], - base_image: str = constants.BASE_IMAGE_TENSORFLOW, + model_provider_parameters: Optional[Any] = None, + dataset_provider_parameters: Optional[Any] = None, + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + "size": constants.PVC_DEFAULT_SIZE, + "storage_class": None, + "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, + }, + objective: Optional[Callable] = None, + base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, + trainer_parameters = None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -176,23 +187,39 @@ def tune( packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", ): - """Create HyperParameter Tuning Katib Experiment from the objective function. + """Create HyperParameter Tuning Katib Experiment using one of the following options: + - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. Args: name: Name for the Experiment. + model_provider_parameters: Parameters for the model provider in the Storage Initializer. 
+ For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. + dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. + For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` + storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. + You can configure PVC size and storage class name in this argument. objective: Objective function that Katib uses to train the model. This function must be Callable and it must have only one dict argument. Katib uses this argument to send HyperParameters to the function. The function should not use any code declared outside of the function definition. Import statements must be added inside the function. - parameters: Dict of HyperParameters to tune your Experiment. You - should use Katib SDK to define the search space for these parameters. - - For example: `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` - - Also, you can use these parameters to define input for your - objective function. base_image: Image to use when executing the objective function. + trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. + You should use the Katib SDK to define the search space for these parameters. + If you choose to use external models and datasets, it should be of type `HuggingFaceTrainerParams`. For example: + ``` + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + learning_rate = katib.search.double(min=0.1, max=0.2), + ), + ), + ``` + If you choose a custom objective function, it should be a dictionary. 
For example: + ``` + trainer_parameters = {"lr": katib.search.double(min=0.1, max=0.2)} + ``` + Also, you can use these parameters to define input for training the external models or your custom objective function. namespace: Namespace for the Experiment. env_per_trial: Environment variable(s) to be attached to each trial container. You can specify a dictionary as a mapping object representing the environment @@ -244,6 +271,24 @@ def tune( RuntimeError: Failed to create Katib Experiment. """ + print( + "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." + ) + + if ( + ((model_provider_parameters is not None) and (dataset_provider_parameters is not None)) == (objective is not None) + ): + raise ValueError( + "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " + "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." + ) + + if ( + not name + or not trainer_parameters + ): + raise ValueError("One of the required parameters is None") + namespace = namespace or self.namespace # Create Katib Experiment template. @@ -282,66 +327,8 @@ def tune( experiment.spec.parallel_trial_count = parallel_trial_count if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - - # Validate objective function. - utils.validate_objective_function(objective) - - # Extract objective function implementation. - objective_code = inspect.getsource(objective) - - # Objective function might be defined in some indented scope - # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) - - # Iterate over input parameters. 
- input_params = {} - experiment_params = [] - trial_params = [] - for p_name, p_value in parameters.items(): - # If input parameter value is Katib Experiment parameter sample. - if isinstance(p_value, models.V1beta1ParameterSpec): - # Wrap value for the function input. - input_params[p_name] = f"${{trialParameters.{p_name}}}" - - # Add value to the Katib Experiment parameters. - p_value.name = p_name - experiment_params.append(p_value) - - # Add value to the Katib Experiment's Trial parameters. - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - else: - # Otherwise, add value to the function input. - input_params[p_name] = p_value - - # Wrap objective function to execute it from the file. For example - # def objective(parameters): - # print(f'Parameters are {parameters}') - # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) - - # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script - ) - + + # Add resources to the Katib Experiment. if isinstance(resources_per_trial, dict): if "gpu" in resources_per_trial: resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") @@ -351,6 +338,7 @@ def tune( limits=resources_per_trial, ) + # Add environment variables to the Katib Experiment. 
env = [] env_from = [] if isinstance(env_per_trial, dict): @@ -369,30 +357,249 @@ def tune( f"Incorrect value for env_per_trial: {env_per_trial}" ) - # Create Trial specification. - trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[ - client.V1Container( + # Create Container and Pod specifications. + # If users choose to use a custom objective function. + if objective is not None: + # Validate objective function. + utils.validate_objective_function(objective) + + # Extract objective function implementation. + objective_code = inspect.getsource(objective) + + # Objective function might be defined in some indented scope + # (e.g. in another function). We need to dedent the function code. + objective_code = textwrap.dedent(objective_code) + + # Iterate over input parameters. + input_params = {} + experiment_params = [] + trial_params = [] + for p_name, p_value in trainer_parameters.items(): + # If input parameter value is Katib Experiment parameter sample. + if isinstance(p_value, models.V1beta1ParameterSpec): + # Wrap value for the function input. + input_params[p_name] = f"${{trialParameters.{p_name}}}" + + # Add value to the Katib Experiment parameters. + p_value.name = p_name + experiment_params.append(p_value) + + # Add value to the Katib Experiment's Trial parameters. + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + else: + # Otherwise, add value to the function input. + input_params[p_name] = p_value + + # Wrap objective function to execute it from the file. 
For example + # def objective(parameters): + # print(f'Parameters are {parameters}') + # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" + ) + + # Add objective code to the execute script. + exec_script = exec_script.format(objective_code=objective_code) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + utils.get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + # create app container spec + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, command=["bash", "-c"], args=[exec_script], - env=env, - env_from=env_from, + env=env if env else None, + env_from=env_from if env_from else None, resources=resources_per_trial, ) - ], + + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( + restart_policy="Never", + containers=[container_spec], + ), + ) + + # If users choose to use external models and datasets. + else: + try: + import peft + import transformers + from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, + ) + except ImportError: + raise ImportError( + "Tune API dependencies not installed. " + + "Run: pip install -U 'kubeflow-training[huggingface]' " + ) + + # Create PVC for the Storage Initializer. 
+ try: + self.core_api.create_namespaced_persistent_volume_claim( + namespace=namespace, + body=utils.get_pvc_spec( + pvc_name=constants.STORAGE_INITIALIZER, + namespace=namespace, + storage_config=storage_config, ), ) + except Exception as e: + pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + # Check if the PVC with the specified name exists. + for pvc in pvc_list.items: + if pvc.metadata.name == constants.STORAGE_INITIALIZER: + print( + f"PVC '{constants.STORAGE_INITIALIZER}' already exists in namespace " + f"{namespace}." + ) + break + else: + raise RuntimeError(f"failed to create PVC. Error: {e}") + + if isinstance(model_provider_parameters, HuggingFaceModelParams): + mp = "hf" + else: + raise ValueError("Model provider parameters must be an instance of HuggingFaceModelParams.") + + if isinstance(dataset_provider_parameters, S3DatasetParams): + dp = "s3" + elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams): + dp = "hf" + else: + raise ValueError("Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams.") + + # Iterate over input parameters. 
+ experiment_params = [] + trial_params = [] + + training_args = trainer_parameters.training_parameters + for p_name, p_value in trainer_parameters.training_parameters.to_dict().items(): + if not hasattr(training_args, p_name): + logger.warning(f"Training parameter {p_name} is not supported by the current transformer.") + continue + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(training_args, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(training_args, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + elif p_value is not None: + old_attr = getattr(training_args, p_name, None) + if old_attr is not None: + value = type(old_attr)(p_value) + setattr(training_args, p_name, value) + + lora_config = trainer_parameters.lora_config + for p_name, p_value in trainer_parameters.lora_config.__dict__.items(): + if not hasattr(lora_config, p_name): + logger.warning(f"Training parameter {p_name} is not supported by the current peft.") + continue + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(lora_config, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(lora_config, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + elif p_value is not None: + old_attr = getattr(lora_config, p_name, None) + if old_attr is not None: + value = type(old_attr)(p_value) + setattr(lora_config, p_name, value) + + # create init container spec. 
+ init_container_spec = client.V1Container( + name=constants.STORAGE_INITIALIZER, + image=constants.STORAGE_INITIALIZER_IMAGE, + args=[ + "--model_provider", + mp, + "--model_provider_parameters", + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + "--dataset_provider", + dp, + "--dataset_provider_parameters", + json.dumps(dataset_provider_parameters.__dict__), + ], + volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + ) + + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, + ) + + lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) + training_args = json.dumps(training_args.to_dict()) + # create app container spec. + container_spec = client.V1Container( + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + image=constants.TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{lora_config}'", + "--training_parameters", + f"'{training_args}'", + ], + volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, + ) + + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( + restart_policy="Never", + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[constants.STORAGE_INITIALIZER_VOLUME], + ), + ) + + # Create Trial specification. 
+ trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, ), ) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 9af281524cd..1fec6068d47 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -13,6 +13,8 @@ # limitations under the License. import os +from kubernetes import client +from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 @@ -56,3 +58,26 @@ BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3" DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789" + +# Constants for Tune API. +STORAGE_INITIALIZER = "storage-initializer" +# The default value for dataset and model storage PVC. +PVC_DEFAULT_SIZE = "10Gi" +# The default value for PVC access modes. 
+PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] + +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" + +STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( + name=STORAGE_INITIALIZER, + mount_path=INIT_CONTAINER_MOUNT_PATH, +) + +STORAGE_INITIALIZER_VOLUME = client.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( + claim_name=STORAGE_INITIALIZER + ), +) + +TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 97c46772611..27133df3cc2 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -20,6 +20,8 @@ from kubeflow.katib import models from kubeflow.katib.constants import constants +from typing import Any, Callable, Dict, List, Optional, Union +import transformers def is_running_in_k8s(): @@ -118,3 +120,44 @@ class FakeResponse: def __init__(self, obj): self.data = json.dumps(obj) + + +def get_pvc_spec( + pvc_name: str, + namespace: str, + storage_config: Dict[str, Optional[Union[str, List[str]]]], +): + if pvc_name is None or namespace is None: + raise ValueError("One of the required storage config argument is None") + + if "size" not in storage_config: + storage_config["size"] = constants.PVC_DEFAULT_SIZE + + if "access_modes" not in storage_config: + storage_config["access_modes"] = constants.PVC_DEFAULT_ACCESS_MODES + + pvc_spec = models.V1PersistentVolumeClaim( + api_version="v1", + kind="PersistentVolumeClaim", + metadata={"name": pvc_name, "namepsace": namespace}, + spec=models.V1PersistentVolumeClaimSpec( + access_modes=storage_config["access_modes"], + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), + ), + ) + + if "storage_class" in storage_config: + pvc_spec.spec.storage_class_name = 
storage_config["storage_class"] + + return pvc_spec + + +class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + if isinstance(obj, type): + return obj.__name__ + return json.JSONEncoder.default(self, obj) \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile new file mode 100644 index 00000000000..c55633ff713 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile @@ -0,0 +1,17 @@ +# Use an official Pytorch runtime as a parent image +FROM nvcr.io/nvidia/pytorch:23.10-py3 + +# Set the working directory in the container +WORKDIR /app + +# Copy the requirements.txt file into the container +COPY requirements.txt /app/requirements.txt + +# Install any needed packages specified in requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the Python package and its source code into the container +COPY . /app + +# Run storage.py when the container launches +ENTRYPOINT ["torchrun", "hf_llm_optimization.py"] \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py new file mode 100644 index 00000000000..114071c7401 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -0,0 +1,196 @@ +import argparse +import logging +from urllib.parse import urlparse +import json +import os + +from datasets import load_from_disk, Dataset +from datasets.distributed import split_dataset_by_node +from peft import LoraConfig, get_peft_model +import transformers +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoModelForImageClassification, + TrainingArguments, + DataCollatorForLanguageModeling, + Trainer, +) + + +# Configure logger. 
+log_formatter = logging.Formatter( + "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" +) +logger = logging.getLogger(__file__) +console_handler = logging.StreamHandler() +console_handler.setFormatter(log_formatter) +logger.addHandler(console_handler) +logger.setLevel(logging.INFO) + + +def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): + # Set up the model and tokenizer + parsed_uri = urlparse(model_uri) + model_name = parsed_uri.netloc + parsed_uri.path + + model = transformer_type.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, + trust_remote_code=True, + ) + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, + ) + + # Freeze model parameters + for param in model.parameters(): + param.requires_grad = False + + return model, tokenizer + + +def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer): + # Load and preprocess the dataset + logger.info("Load and preprocess dataset") + + if transformer_type != AutoModelForImageClassification: + dataset = load_from_disk(dataset_dir) + + logger.info(f"Dataset specification: {dataset}") + logger.info("-" * 40) + + logger.info("Tokenize dataset") + # TODO (andreyvelich): Discuss how user should set the tokenizer function. + dataset = dataset.map( + lambda x: tokenizer(x["text"], padding="max_length", truncation=True), + batched=True, + ) + else: + dataset = load_from_disk(dataset_dir) + + # Check if dataset contains `train` key. Otherwise, load full dataset to train_data. + if "train" in dataset: + train_data = dataset["train"] + else: + train_data = dataset + + try: + eval_data = dataset["eval"] or dataset["test"] + except Exception: + eval_data = None + logger.info("Evaluation dataset is not found") + + # Distribute dataset across PyTorchJob workers. 
+ RANK = int(os.environ["RANK"]) + WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + logger.info( + f"Distributed dataset across PyTorchJob workers. WORLD_SIZE: {WORLD_SIZE}, RANK: {RANK}" + ) + if isinstance(train_data, Dataset): + train_data = split_dataset_by_node( + train_data, + rank=RANK, + world_size=WORLD_SIZE, + ) + if isinstance(eval_data, Dataset): + eval_data = split_dataset_by_node( + eval_data, + rank=RANK, + world_size=WORLD_SIZE, + ) + + return train_data, eval_data + + +def setup_peft_model(model, lora_config): + # Set up the PEFT model + model.enable_input_require_grads() + model = get_peft_model(model, lora_config) + return model + + +def train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args): + # Setup the Trainer. + trainer = Trainer( + model=model, + train_dataset=train_data, + eval_dataset=eval_data, + args=train_args, + ) + + # TODO (andreyvelich): Currently, data collator is supported only for casual LM Transformer. + if transformer_type == AutoModelForCausalLM: + logger.info("Add data collector for language modeling") + logger.info("-" * 40) + trainer.data_collator = DataCollatorForLanguageModeling( + tokenizer, + pad_to_multiple_of=8, + mlm=False, + ) + + # Train the model. + train_results = trainer.train() + print(f"train_loss={train_results.training_loss}") + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Script for training a model with PEFT configuration." 
+ ) + + parser.add_argument("--model_uri", help="model uri") + parser.add_argument("--transformer_type", help="model transformer type") + parser.add_argument("--model_dir", help="directory containing model") + parser.add_argument("--dataset_dir", help="directory containing dataset") + parser.add_argument("--lora_config", help="lora_config") + parser.add_argument( + "--training_parameters", help="hugging face training parameters" + ) + + return parser.parse_args() + + +if __name__ == "__main__": + logger.info("Starting HuggingFace LLM Trainer") + args = parse_arguments() + + train_args = TrainingArguments(**json.loads(args.training_parameters)) + reference_args = transformers.TrainingArguments(output_dir=train_args.output_dir) + for key, val in train_args.to_dict().items(): + old_attr = getattr(reference_args, key, None) + if old_attr is not None: + val = type(old_attr)(val) + setattr(train_args, key, val) + + lora_config = LoraConfig(**json.loads(args.lora_config)) + reference_lora_config = LoraConfig() + for key, val in lora_config.__dict__.items(): + old_attr = getattr(reference_lora_config, key, None) + if old_attr is not None: + val = type(old_attr)(val) + setattr(lora_config, key, val) + + transformer_type = getattr(transformers, args.transformer_type) + + logger.info("Setup model and tokenizer") + model, tokenizer = setup_model_and_tokenizer( + args.model_uri, transformer_type, args.model_dir + ) + + logger.info("Preprocess dataset") + train_data, eval_data = load_and_preprocess_data( + args.dataset_dir, transformer_type, tokenizer + ) + + logger.info("Setup LoRA config for model") + model = setup_peft_model(model, lora_config) + + logger.info("Start model training") + train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args) + + logger.info("Training is complete") \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt new file mode 100644 
index 00000000000..ba76f3cdcec --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt @@ -0,0 +1,4 @@ +peft==0.3.0 +datasets==2.15.0 +transformers==4.38.0 +evaluate==0.4.0 \ No newline at end of file diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 39a4f0e2372..685c45c102e 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -68,4 +68,7 @@ "Topic :: Software Development :: Libraries :: Python Modules", ], install_requires=REQUIRES, + extras_require={ + "huggingface": ["kubeflow-training[huggingface]"], + }, ) From 158c8f3e32e3d5b08baea4371d17653f0655b107 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 12:14:20 -0700 Subject: [PATCH 02/53] resolve conflict Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5b2cf67ab21..4047fe54fe7 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -371,8 +371,13 @@ def tune( raise ValueError( f"Incorrect value for env_per_trial: {env_per_trial}" ) - -<<<<<<< HEAD + + # Add metrics collector to the Katib Experiment. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + ) + # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: @@ -609,43 +614,13 @@ def tune( volumes=[constants.STORAGE_INITIALIZER_VOLUME], ), ) - -======= - # Add metrics collector to the Katib Experiment. 
- # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. - experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) - ) ->>>>>>> upstream/master # Create Trial specification. trial_spec = client.V1Job( api_version="batch/v1", kind="Job", spec=client.V1JobSpec( -<<<<<<< HEAD template=pod_spec, -======= - template=client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[ - client.V1Container( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, - ) - ], - ), - ) ->>>>>>> upstream/master ), ) From f4a0d4e90c556e5a855a56ebb2fcbdf6268ecc68 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 13:46:19 -0700 Subject: [PATCH 03/53] fix the problem of dependency Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 9 ++------- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 7 +++++-- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4047fe54fe7..6e0518bb0ab 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -572,11 +572,6 @@ def tune( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) - lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = 
json.dumps(training_args.to_dict()) # create app container spec. @@ -589,9 +584,9 @@ def tune( "--transformer_type", model_provider_parameters.transformer_type.__name__, "--model_dir", - VOLUME_PATH_MODEL, + constants.VOLUME_PATH_MODEL, "--dataset_dir", - VOLUME_PATH_DATASET, + constants.VOLUME_PATH_DATASET, "--lora_config", f"'{lora_config}'", "--training_parameters", diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 5707a39ccb9..17f5619b922 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os from kubernetes import client -from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 @@ -70,7 +69,9 @@ # The default value for PVC access modes. PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" +INIT_CONTAINER_MOUNT_PATH = "/workspace" +VOLUME_PATH_DATASET = INIT_CONTAINER_MOUNT_PATH + "/dataset" +VOLUME_PATH_MODEL = INIT_CONTAINER_MOUNT_PATH + "/model" STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( name=STORAGE_INITIALIZER, @@ -84,4 +85,6 @@ ), ) +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" + TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 8985d785236..0f4a2ad263a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -21,7 +21,6 @@ from kubeflow.katib import models from kubeflow.katib.constants import constants from typing import Any, Callable, Dict, List, Optional, Union -import transformers def 
is_running_in_k8s(): @@ -170,4 +169,4 @@ def default(self, obj): return list(obj) if isinstance(obj, type): return obj.__name__ - return json.JSONEncoder.default(self, obj) \ No newline at end of file + return json.JSONEncoder.default(self, obj) From 7e7dd56beb44fbb61c6d421f6bfa3ac66b3701d6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 14:40:15 -0700 Subject: [PATCH 04/53] fix the format of import statement Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 17 ++++++------- .../v1beta1/kubeflow/katib/utils/utils.py | 3 +-- .../kubeflow/trainer/hf_llm_optimization.py | 24 +++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6e0518bb0ab..7a661a5bf1b 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -13,13 +13,13 @@ # limitations under the License. import inspect +import json import logging import multiprocessing import textwrap import time from typing import Any, Callable, Dict, List, Optional, Union -import json -import logging + logger = logging.getLogger(__name__) import grpc @@ -464,14 +464,15 @@ def tune( # If users choose to use external models and datasets. else: try: + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceTrainerParams + from kubeflow.storage_initializer.s3 import S3DatasetParams import peft import transformers - from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - HuggingFaceTrainerParams, - ) except ImportError: raise ImportError( "Tune API dependencies not installed. 
" diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 0f4a2ad263a..2a2e2b4b4b8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -16,11 +16,10 @@ import json import os import textwrap -from typing import Any, Callable +from typing import Any, Callable, Dict, List, Optional, Union from kubeflow.katib import models from kubeflow.katib.constants import constants -from typing import Any, Callable, Dict, List, Optional, Union def is_running_in_k8s(): diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py index 114071c7401..e12d3e3a940 100644 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -1,21 +1,21 @@ import argparse -import logging -from urllib.parse import urlparse import json +import logging import os +from urllib.parse import urlparse -from datasets import load_from_disk, Dataset +from datasets import Dataset +from datasets import load_from_disk from datasets.distributed import split_dataset_by_node -from peft import LoraConfig, get_peft_model +from peft import get_peft_model +from peft import LoraConfig import transformers -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - AutoModelForImageClassification, - TrainingArguments, - DataCollatorForLanguageModeling, - Trainer, -) +from transformers import AutoModelForCausalLM +from transformers import AutoModelForImageClassification +from transformers import AutoTokenizer +from transformers import DataCollatorForLanguageModeling +from transformers import Trainer +from transformers import TrainingArguments # Configure logger. 
From 62ad3850c1b57232ccfd5df3595a401d546fc1e3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 14:46:55 -0700 Subject: [PATCH 05/53] adjust the blank lines Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 1 + sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 17f5619b922..955ee07a01a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -13,6 +13,7 @@ # limitations under the License. import os + from kubernetes import client # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py index e12d3e3a940..a0050c45071 100644 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -17,7 +17,6 @@ from transformers import Trainer from transformers import TrainingArguments - # Configure logger. 
log_formatter = logging.Formatter( "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" From 3f36740364decc59c389bddef9bffa7d9babd285 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 22 Jul 2024 15:36:54 -0700 Subject: [PATCH 06/53] delete the trainer to reuse it in Training Operator Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/trainer/Dockerfile | 17 -- .../kubeflow/trainer/hf_llm_optimization.py | 195 ------------------ .../v1beta1/kubeflow/trainer/requirements.txt | 4 - 3 files changed, 216 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/Dockerfile delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/requirements.txt diff --git a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile deleted file mode 100644 index c55633ff713..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -# Use an official Pytorch runtime as a parent image -FROM nvcr.io/nvidia/pytorch:23.10-py3 - -# Set the working directory in the container -WORKDIR /app - -# Copy the requirements.txt file into the container -COPY requirements.txt /app/requirements.txt - -# Install any needed packages specified in requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the Python package and its source code into the container -COPY . 
/app - -# Run storage.py when the container launches -ENTRYPOINT ["torchrun", "hf_llm_optimization.py"] \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py deleted file mode 100644 index a0050c45071..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import logging -import os -from urllib.parse import urlparse - -from datasets import Dataset -from datasets import load_from_disk -from datasets.distributed import split_dataset_by_node -from peft import get_peft_model -from peft import LoraConfig -import transformers -from transformers import AutoModelForCausalLM -from transformers import AutoModelForImageClassification -from transformers import AutoTokenizer -from transformers import DataCollatorForLanguageModeling -from transformers import Trainer -from transformers import TrainingArguments - -# Configure logger. 
-log_formatter = logging.Formatter( - "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" -) -logger = logging.getLogger(__file__) -console_handler = logging.StreamHandler() -console_handler.setFormatter(log_formatter) -logger.addHandler(console_handler) -logger.setLevel(logging.INFO) - - -def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): - # Set up the model and tokenizer - parsed_uri = urlparse(model_uri) - model_name = parsed_uri.netloc + parsed_uri.path - - model = transformer_type.from_pretrained( - pretrained_model_name_or_path=model_name, - cache_dir=model_dir, - local_files_only=True, - trust_remote_code=True, - ) - - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=model_name, - cache_dir=model_dir, - local_files_only=True, - ) - - # Freeze model parameters - for param in model.parameters(): - param.requires_grad = False - - return model, tokenizer - - -def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer): - # Load and preprocess the dataset - logger.info("Load and preprocess dataset") - - if transformer_type != AutoModelForImageClassification: - dataset = load_from_disk(dataset_dir) - - logger.info(f"Dataset specification: {dataset}") - logger.info("-" * 40) - - logger.info("Tokenize dataset") - # TODO (andreyvelich): Discuss how user should set the tokenizer function. - dataset = dataset.map( - lambda x: tokenizer(x["text"], padding="max_length", truncation=True), - batched=True, - ) - else: - dataset = load_from_disk(dataset_dir) - - # Check if dataset contains `train` key. Otherwise, load full dataset to train_data. - if "train" in dataset: - train_data = dataset["train"] - else: - train_data = dataset - - try: - eval_data = dataset["eval"] or dataset["test"] - except Exception: - eval_data = None - logger.info("Evaluation dataset is not found") - - # Distribute dataset across PyTorchJob workers. 
- RANK = int(os.environ["RANK"]) - WORLD_SIZE = int(os.environ["WORLD_SIZE"]) - logger.info( - f"Distributed dataset across PyTorchJob workers. WORLD_SIZE: {WORLD_SIZE}, RANK: {RANK}" - ) - if isinstance(train_data, Dataset): - train_data = split_dataset_by_node( - train_data, - rank=RANK, - world_size=WORLD_SIZE, - ) - if isinstance(eval_data, Dataset): - eval_data = split_dataset_by_node( - eval_data, - rank=RANK, - world_size=WORLD_SIZE, - ) - - return train_data, eval_data - - -def setup_peft_model(model, lora_config): - # Set up the PEFT model - model.enable_input_require_grads() - model = get_peft_model(model, lora_config) - return model - - -def train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args): - # Setup the Trainer. - trainer = Trainer( - model=model, - train_dataset=train_data, - eval_dataset=eval_data, - args=train_args, - ) - - # TODO (andreyvelich): Currently, data collator is supported only for casual LM Transformer. - if transformer_type == AutoModelForCausalLM: - logger.info("Add data collector for language modeling") - logger.info("-" * 40) - trainer.data_collator = DataCollatorForLanguageModeling( - tokenizer, - pad_to_multiple_of=8, - mlm=False, - ) - - # Train the model. - train_results = trainer.train() - print(f"train_loss={train_results.training_loss}") - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Script for training a model with PEFT configuration." 
- ) - - parser.add_argument("--model_uri", help="model uri") - parser.add_argument("--transformer_type", help="model transformer type") - parser.add_argument("--model_dir", help="directory containing model") - parser.add_argument("--dataset_dir", help="directory containing dataset") - parser.add_argument("--lora_config", help="lora_config") - parser.add_argument( - "--training_parameters", help="hugging face training parameters" - ) - - return parser.parse_args() - - -if __name__ == "__main__": - logger.info("Starting HuggingFace LLM Trainer") - args = parse_arguments() - - train_args = TrainingArguments(**json.loads(args.training_parameters)) - reference_args = transformers.TrainingArguments(output_dir=train_args.output_dir) - for key, val in train_args.to_dict().items(): - old_attr = getattr(reference_args, key, None) - if old_attr is not None: - val = type(old_attr)(val) - setattr(train_args, key, val) - - lora_config = LoraConfig(**json.loads(args.lora_config)) - reference_lora_config = LoraConfig() - for key, val in lora_config.__dict__.items(): - old_attr = getattr(reference_lora_config, key, None) - if old_attr is not None: - val = type(old_attr)(val) - setattr(lora_config, key, val) - - transformer_type = getattr(transformers, args.transformer_type) - - logger.info("Setup model and tokenizer") - model, tokenizer = setup_model_and_tokenizer( - args.model_uri, transformer_type, args.model_dir - ) - - logger.info("Preprocess dataset") - train_data, eval_data = load_and_preprocess_data( - args.dataset_dir, transformer_type, tokenizer - ) - - logger.info("Setup LoRA config for model") - model = setup_peft_model(model, lora_config) - - logger.info("Start model training") - train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args) - - logger.info("Training is complete") \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt deleted file mode 
100644 index ba76f3cdcec..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -peft==0.3.0 -datasets==2.15.0 -transformers==4.38.0 -evaluate==0.4.0 \ No newline at end of file From 9d202538f24bc5a84ee3e10fb079e7f1301b2115 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 22 Jul 2024 15:43:01 -0700 Subject: [PATCH 07/53] update constants Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 955ee07a01a..21d421b7bbc 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -88,4 +88,4 @@ STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" -TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" \ No newline at end of file From dfbe793d0b909066bec81cc021480a48cdd6ccd8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 25 Jul 2024 11:38:50 -0700 Subject: [PATCH 08/53] update metrics format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7a661a5bf1b..18f6c0158e4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -374,8 +374,13 @@ def tune( # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. 
- experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + source=models.V1beta1SourceSpec( + filter=models.V1beta1FilterSpec( + metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] + ) + ) ) # Create Container and Pod specifications. From 290a249426599c8cba7e09d5bbf17d765ef629bf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:38:08 +0800 Subject: [PATCH 09/53] update the type of and Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 146 +++++++++++------- .../kubeflow/katib/constants/constants.py | 2 +- .../v1beta1/kubeflow/katib/utils/utils.py | 1 + sdk/python/v1beta1/setup.py | 4 +- 4 files changed, 94 insertions(+), 59 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 18f6c0158e4..568c150fdee 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -18,7 +18,7 @@ import multiprocessing import textwrap import time -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING logger = logging.getLogger(__name__) @@ -168,16 +168,16 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - model_provider_parameters: Optional[Any] = None, - dataset_provider_parameters: Optional[Any] = None, - storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + model_provider_parameters: Optional['HuggingFaceModelParams'] = None, + dataset_provider_parameters: Optional[Union['HuggingFaceDatasetParams', 'S3DatasetParams']] = None, + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, }, objective: Optional[Callable] = None, base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - trainer_parameters = None, + trainer_parameters: Union['HuggingFaceTrainerParams', Dict[str, Any]]=None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -200,7 +200,7 @@ def tune( metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: - - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. Args: @@ -276,7 +276,7 @@ def tune( to the base image packages. These packages are installed before executing the objective function. pip_index_url: The PyPI url from which to install Python packages. - metrics_collector_config: Specify the config of metrics collector, + metrics_collector_config: Specify the config of metrics collector, for example, `metrics_collector_config = {"kind": "Push"}`. Currently, we only support `StdOut` and `Push` metrics collector. @@ -289,19 +289,17 @@ def tune( print( "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." ) - + if ( - ((model_provider_parameters is not None) and (dataset_provider_parameters is not None)) == (objective is not None) - ): + (model_provider_parameters is not None) + and (dataset_provider_parameters is not None) + ) == (objective is not None): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." 
) - - if ( - not name - or not trainer_parameters - ): + + if not name or not trainer_parameters: raise ValueError("One of the required parameters is None") namespace = namespace or self.namespace @@ -342,7 +340,7 @@ def tune( experiment.spec.parallel_trial_count = parallel_trial_count if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - + # Add resources to the Katib Experiment. if isinstance(resources_per_trial, dict): if "gpu" in resources_per_trial: @@ -371,18 +369,20 @@ def tune( raise ValueError( f"Incorrect value for env_per_trial: {env_per_trial}" ) - + # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] ) - ) + ), ) - + # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: @@ -441,21 +441,23 @@ def tune( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) + utils.get_script_for_python_packages( + packages_to_install, pip_index_url + ) + exec_script ) - + # create app container spec container_spec = client.V1Container( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, - ) - + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + image=base_image, + command=["bash", "-c"], + args=[exec_script], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, + ) + pod_spec = client.V1PodTemplateSpec( metadata=models.V1ObjectMeta( annotations={"sidecar.istio.io/inject": "false"} @@ -469,12 +471,15 @@ def tune( # If users choose to use external models and datasets. else: try: - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceDatasetParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceModelParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceTrainerParams + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceTrainerParams, + ) from kubeflow.storage_initializer.s3 import S3DatasetParams import peft import transformers @@ -483,7 +488,7 @@ def tune( "Tune API dependencies not installed. " + "Run: pip install -U 'kubeflow-training[huggingface]' " ) - + # Create PVC for the Storage Initializer. 
try: self.core_api.create_namespaced_persistent_volume_claim( @@ -495,7 +500,9 @@ def tune( ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim( + namespace + ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == constants.STORAGE_INITIALIZER: @@ -506,27 +513,36 @@ def tune( break else: raise RuntimeError(f"failed to create PVC. Error: {e}") - + if isinstance(model_provider_parameters, HuggingFaceModelParams): mp = "hf" else: - raise ValueError("Model provider parameters must be an instance of HuggingFaceModelParams.") - + raise ValueError( + "Model provider parameters must be an instance of HuggingFaceModelParams." + ) + if isinstance(dataset_provider_parameters, S3DatasetParams): dp = "s3" elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams): dp = "hf" else: - raise ValueError("Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams.") - + raise ValueError( + "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams." + ) + # Iterate over input parameters. experiment_params = [] trial_params = [] training_args = trainer_parameters.training_parameters - for p_name, p_value in trainer_parameters.training_parameters.to_dict().items(): + for ( + p_name, + p_value, + ) in trainer_parameters.training_parameters.to_dict().items(): if not hasattr(training_args, p_name): - logger.warning(f"Training parameter {p_name} is not supported by the current transformer.") + logger.warning( + f"Training parameter {p_name} is not supported by the current transformer." 
+ ) continue if isinstance(p_value, models.V1beta1ParameterSpec): old_attr = getattr(training_args, p_name, None) @@ -535,7 +551,9 @@ def tune( setattr(training_args, p_name, value) p_value.name = p_name experiment_params.append(p_value) - trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) elif p_value is not None: old_attr = getattr(training_args, p_name, None) if old_attr is not None: @@ -545,7 +563,9 @@ def tune( lora_config = trainer_parameters.lora_config for p_name, p_value in trainer_parameters.lora_config.__dict__.items(): if not hasattr(lora_config, p_name): - logger.warning(f"Training parameter {p_name} is not supported by the current peft.") + logger.warning( + f"Training parameter {p_name} is not supported by the current peft." + ) continue if isinstance(p_value, models.V1beta1ParameterSpec): old_attr = getattr(lora_config, p_name, None) @@ -554,7 +574,9 @@ def tune( setattr(lora_config, p_name, value) p_value.name = p_name experiment_params.append(p_value) - trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) elif p_value is not None: old_attr = getattr(lora_config, p_name, None) if old_attr is not None: @@ -569,7 +591,9 @@ def tune( "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -972,7 +996,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Raise exception if Experiment 
is Failed. @@ -992,7 +1018,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Running condition. @@ -1003,7 +1031,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Restarting condition. @@ -1014,7 +1044,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Succeeded condition. @@ -1025,7 +1057,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Otherwise, print the current Experiment results and sleep for the pooling interval. 
diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 21d421b7bbc..8de550e2fe7 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -88,4 +88,4 @@ STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" \ No newline at end of file +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 2a2e2b4b4b8..8c90a001d96 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -72,6 +72,7 @@ def print_experiment_status(experiment: models.V1beta1Experiment): print(f"Current Optimal Trial:\n {experiment.status.current_optimal_trial}") print(f"Experiment conditions:\n {experiment.status.conditions}") + def validate_metrics_value(value: Any): """Validate if the metrics value can be converted to type `float`.""" try: diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 757ccd4a05d..b715be4f7c3 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -70,6 +70,6 @@ ], install_requires=REQUIRES, extras_require={ - "huggingface": ["kubeflow-training[huggingface]"], - }, + "huggingface": ["kubeflow-training[huggingface]"], + }, ) From aba2606e010bac8a05bbd3f00544368de64e0bb9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:43:49 +0800 Subject: [PATCH 10/53] update the message of 'ImportError' Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 568c150fdee..1d45200b8f2 
100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -486,7 +486,7 @@ def tune( except ImportError: raise ImportError( "Tune API dependencies not installed. " - + "Run: pip install -U 'kubeflow-training[huggingface]' " + + "Run: pip install -U 'kubeflow-katib[huggingface]' " ) # Create PVC for the Storage Initializer. From eaf0193a9edeaf8e39fc72df8e0bc156b45b72ad Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:48:08 +0800 Subject: [PATCH 11/53] add TODO of PVC creation Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 1d45200b8f2..d10890238da 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -490,6 +490,7 @@ def tune( ) # Create PVC for the Storage Initializer. + # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. 
try: self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, From 62355a2bc3acb142d00bc7401dc6e2982f87a529 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 10:25:02 +0800 Subject: [PATCH 12/53] update the name of pvc Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d10890238da..7667f6e30fd 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -495,7 +495,7 @@ def tune( self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, body=utils.get_pvc_spec( - pvc_name=constants.STORAGE_INITIALIZER, + pvc_name=name, namespace=namespace, storage_config=storage_config, ), @@ -506,9 +506,9 @@ def tune( ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: - if pvc.metadata.name == constants.STORAGE_INITIALIZER: + if pvc.metadata.name == name: print( - f"PVC '{constants.STORAGE_INITIALIZER}' already exists in namespace " + f"PVC '{name}' already exists in namespace " f"{namespace}." 
) break From 7b2b40eaa5fb89e0cc8b3b57787acbad2493ce89 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 16:31:41 +0800 Subject: [PATCH 13/53] reuse constants from Training Operator Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 29 +++++++++++++------ .../kubeflow/katib/constants/constants.py | 22 -------------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7667f6e30fd..484ae04167a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -471,6 +471,17 @@ def tune( # If users choose to use external models and datasets. else: try: + from kubeflow.training.constants.constants import( + STORAGE_INITIALIZER, + STORAGE_INITIALIZER_VOLUME_MOUNT, + STORAGE_INITIALIZER_VOLUME, + STORAGE_INITIALIZER_IMAGE, + TRAINER_TRANSFORMER_IMAGE, + ) + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, + ) from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, ) @@ -586,8 +597,8 @@ def tune( # create init container spec. init_container_spec = client.V1Container( - name=constants.STORAGE_INITIALIZER, - image=constants.STORAGE_INITIALIZER_IMAGE, + name=STORAGE_INITIALIZER, + image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, @@ -600,7 +611,7 @@ def tune( "--dataset_provider_parameters", json.dumps(dataset_provider_parameters.__dict__), ], - volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) @@ -608,22 +619,22 @@ def tune( # create app container spec. 
container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=constants.TRAINER_TRANSFORMER_IMAGE, + image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, "--transformer_type", model_provider_parameters.transformer_type.__name__, "--model_dir", - constants.VOLUME_PATH_MODEL, + VOLUME_PATH_MODEL, "--dataset_dir", - constants.VOLUME_PATH_DATASET, + VOLUME_PATH_DATASET, "--lora_config", f"'{lora_config}'", "--training_parameters", f"'{training_args}'", ], - volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], env=env if env else None, env_from=env_from if env_from else None, resources=resources_per_trial, @@ -637,7 +648,7 @@ def tune( restart_policy="Never", containers=[container_spec], init_containers=[init_container_spec], - volumes=[constants.STORAGE_INITIALIZER_VOLUME], + volumes=[STORAGE_INITIALIZER_VOLUME], ), ) @@ -957,7 +968,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, + timeout: int = 6000, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 8de550e2fe7..fa4e5882727 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -63,29 +63,7 @@ DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789" -# Constants for Tune API. -STORAGE_INITIALIZER = "storage-initializer" # The default value for dataset and model storage PVC. PVC_DEFAULT_SIZE = "10Gi" # The default value for PVC access modes. 
PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] - -INIT_CONTAINER_MOUNT_PATH = "/workspace" -VOLUME_PATH_DATASET = INIT_CONTAINER_MOUNT_PATH + "/dataset" -VOLUME_PATH_MODEL = INIT_CONTAINER_MOUNT_PATH + "/model" - -STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( - name=STORAGE_INITIALIZER, - mount_path=INIT_CONTAINER_MOUNT_PATH, -) - -STORAGE_INITIALIZER_VOLUME = client.V1Volume( - name=STORAGE_INITIALIZER, - persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( - claim_name=STORAGE_INITIALIZER - ), -) - -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" - -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" From acd1dcf07bc871eb2e2004f1f63cd90203e9dedb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 30 Jul 2024 10:37:34 +0800 Subject: [PATCH 14/53] keep 'parameters' and update validation Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 99 ++++++++++++------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 484ae04167a..64f5cc53aea 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -18,7 +18,7 @@ import multiprocessing import textwrap import time -from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union logger = logging.getLogger(__name__) @@ -168,8 +168,11 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - model_provider_parameters: Optional['HuggingFaceModelParams'] = None, - dataset_provider_parameters: Optional[Union['HuggingFaceDatasetParams', 'S3DatasetParams']] = None, + model_provider_parameters: Optional["HuggingFaceModelParams"] = None, + dataset_provider_parameters: Optional[ + Union["HuggingFaceDatasetParams", "S3DatasetParams"] + ] = None, + trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, @@ -177,7 +180,7 @@ def tune( }, objective: Optional[Callable] = None, base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - trainer_parameters: Union['HuggingFaceTrainerParams', Dict[str, Any]]=None, + parameters: Optional[Dict[str, Any]] = None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -200,8 +203,12 @@ def tune( metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: - - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. - - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. + 1. 
External models and datasets + Parameters: `model_provider_parameters` + `dataset_provider_parameters` + `trainer_parameters`. + Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + 2. Custom objective function + Parameters: `objective` + `base_image` + `parameters`. + Usage: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. The `parameters` should be a dictionary to define the search space for these parameters. Args: name: Name for the Experiment. @@ -209,6 +216,15 @@ def tune( For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` + trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. 
You should use the Katib SDK to define the search space for these parameters.For example: + ``` + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + learning_rate = katib.search.double(min=0.1, max=0.2), + ), + ), + ``` + Also, you can use these parameters to define input for training the models. storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. You can configure PVC size and storage class name in this argument. objective: Objective function that Katib uses to train the model. @@ -217,21 +233,11 @@ def tune( The function should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. - trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. - You should use the Katib SDK to define the search space for these parameters. - If you choose to use external models and datasets, it should be of type `HuggingFaceTrainerParams`. For example: - ``` - trainer_parameters = HuggingFaceTrainerParams( - training_parameters = transformers.TrainingArguments( - learning_rate = katib.search.double(min=0.1, max=0.2), - ), - ), - ``` - If you choose a custom objective function, it should be a dictionary. For example: + parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: ``` - trainer_parameters = {"lr": katib.search.double(min=0.1, max=0.2)} + parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` ``` - Also, you can use these parameters to define input for training the external models or your custom objective function. + Also, you can use these parameters to define input for your objective function. namespace: Namespace for the Experiment. 
env_per_trial: Environment variable(s) to be attached to each trial container. You can specify a dictionary as a mapping object representing the environment @@ -287,20 +293,29 @@ def tune( """ print( - "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." + "Thank you for using the `tune` API for LLM hyperparameter optimization. " + "You can create a HyperParameter Optimization Katib Experiment using one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`.\n" + "2. Use custom objective function: specify `objective`, `base_image` and `parameters`.\n" + "This feature is in the alpha stage. The Kubeflow community is looking for your feedback. Please share your experience via the #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( - (model_provider_parameters is not None) - and (dataset_provider_parameters is not None) - ) == (objective is not None): + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) and ( + objective is not None or base_image is not None or parameters is not None + ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " - "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." + "You should only specify one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`;\n" + "2. Use custom objective function: specify `objective`, `base_image` and `parameters`." 
) - if not name or not trainer_parameters: - raise ValueError("One of the required parameters is None") + if not name: + raise ValueError("Please specify name for the Experiment.") namespace = namespace or self.namespace @@ -378,7 +393,12 @@ def tune( ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( - metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] + metrics_format=[ + # For example: train_loss=0.846 + r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + # For example: 'train_loss':0.846 + r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + ] ) ), ) @@ -386,6 +406,9 @@ def tune( # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: + if not base_image or not parameters: + raise ValueError("One of the required parameters is None.") + # Validate objective function. utils.validate_objective_function(objective) @@ -400,7 +423,7 @@ def tune( input_params = {} experiment_params = [] trial_params = [] - for p_name, p_value in trainer_parameters.items(): + for p_name, p_value in parameters.items(): # If input parameter value is Katib Experiment parameter sample. if isinstance(p_value, models.V1beta1ParameterSpec): # Wrap value for the function input. @@ -447,7 +470,7 @@ def tune( + exec_script ) - # create app container spec + # Create app container spec container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, @@ -470,8 +493,15 @@ def tune( # If users choose to use external models and datasets. 
else: + if ( + not model_provider_parameters + or not dataset_provider_parameters + or not trainer_parameters + ): + raise ValueError("One of the required parameters is None") + try: - from kubeflow.training.constants.constants import( + from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, STORAGE_INITIALIZER_VOLUME, @@ -519,8 +549,7 @@ def tune( for pvc in pvc_list.items: if pvc.metadata.name == name: print( - f"PVC '{name}' already exists in namespace " - f"{namespace}." + f"PVC '{name}' already exists in namespace " f"{namespace}." ) break else: @@ -595,7 +624,7 @@ def tune( value = type(old_attr)(p_value) setattr(lora_config, p_name, value) - # create init container spec. + # Create init container spec. init_container_spec = client.V1Container( name=STORAGE_INITIALIZER, image=STORAGE_INITIALIZER_IMAGE, @@ -616,7 +645,7 @@ def tune( lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - # create app container spec. + # Create app container spec. 
container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=TRAINER_TRANSFORMER_IMAGE, @@ -968,7 +997,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 6000, + timeout: int = 600, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): From 10b057df5cff36b913e9f68da10c246122398809 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 1 Aug 2024 07:01:56 +0800 Subject: [PATCH 15/53] update for test Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 260 +++++++++++++----- .../v1beta1/kubeflow/katib/types/__init__.py | 7 + .../kubeflow/katib/types/trainer_resources.py | 139 ++++++++++ sdk/python/v1beta1/test_llm.py | 63 +++++ 4 files changed, 397 insertions(+), 72 deletions(-) create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py create mode 100644 sdk/python/v1beta1/test_llm.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 64f5cc53aea..5a9b1ea5bff 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -24,6 +24,7 @@ import grpc from kubeflow.katib import models +from kubeflow.katib import types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -196,7 +197,7 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, + resources_per_trial: Union[dict, client.V1ResourceRequirements, types.TrainerResources, None] = None, retain_trials: bool = False, packages_to_install: List[str] = None, 
pip_index_url: str = "https://pypi.org/simple", @@ -357,14 +358,14 @@ def tune( experiment.spec.max_failed_trial_count = max_failed_trial_count # Add resources to the Katib Experiment. - if isinstance(resources_per_trial, dict): - if "gpu" in resources_per_trial: - resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") + #if isinstance(resources_per_trial, dict): + # if "gpu" in resources_per_trial: + # resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - resources_per_trial = client.V1ResourceRequirements( - requests=resources_per_trial, - limits=resources_per_trial, - ) + # resources_per_trial = client.V1ResourceRequirements( + # requests=resources_per_trial, + # limits=resources_per_trial, + # ) # Add environment variables to the Katib Experiment. env = [] @@ -413,11 +414,11 @@ def tune( utils.validate_objective_function(objective) # Extract objective function implementation. - objective_code = inspect.getsource(objective) + #objective_code = inspect.getsource(objective) # Objective function might be defined in some indented scope # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) + #objective_code = textwrap.dedent(objective_code) # Iterate over input parameters. input_params = {} @@ -445,51 +446,94 @@ def tune( # def objective(parameters): # print(f'Parameters are {parameters}') # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + #objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" # Prepare execute script template. 
- exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) + #exec_script = textwrap.dedent( + # """ + # program_path=$(mktemp -d) + # read -r -d '' SCRIPT << EOM\n + # {objective_code} + # EOM + # printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + # python3 -u $program_path/ephemeral_objective.py""" + #) # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) + #exec_script = exec_script.format(objective_code=objective_code) # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages( - packages_to_install, pip_index_url - ) - + exec_script - ) - + #if packages_to_install is not None: + # exec_script = ( + # utils.get_script_for_python_packages( + # packages_to_install, pip_index_url + # ) + # + exec_script + # ) + + from kubeflow.training.utils import get_container_spec, get_pod_template_spec, get_pytorchjob_template # Create app container spec - container_spec = client.V1Container( + container_spec = get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + env=env, + env_from=env_from, ) + #container_spec = client.V1Container( + # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + # image=base_image, + # command=["bash", "-c"], + # args=[exec_script], + # env=env if env else None, + # 
env_from=env_from if env_from else None, + # resources=resources_per_trial, + #) + + if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): + pod_spec = get_pod_template_spec( + containers = [container_spec], + restart_policy="Never", + ) + #pod_spec = client.V1PodTemplateSpec( + # metadata=models.V1ObjectMeta( + # annotations={"sidecar.istio.io/inject": "false"} + # ), + # spec=client.V1PodSpec( + # restart_policy="Never", + # containers=[container_spec], + # ), + #) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + else: + worker_pod_spec = get_pod_template_spec( + containers = [container_spec], + restart_policy="Never", + ) - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( + master_pod_spec = get_pod_template_spec( + containers = [container_spec], restart_policy="Never", - containers=[container_spec], - ), - ) + ) + trial_spec = get_pytorchjob_template( + name=name, + namespace=namespace, + master_pod_template_spec=master_pod_spec, + worker_pod_template_spec=worker_pod_spec, + num_workers=resources_per_trial.num_workers, + num_procs_per_worker=resources_per_trial.num_procs_per_worker, + ) # If users choose to use external models and datasets. else: @@ -504,7 +548,7 @@ def tune( from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, - STORAGE_INITIALIZER_VOLUME, + #STORAGE_INITIALIZER_VOLUME, STORAGE_INITIALIZER_IMAGE, TRAINER_TRANSFORMER_IMAGE, ) @@ -625,16 +669,14 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
- init_container_spec = client.V1Container( + init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, - image=STORAGE_INITIALIZER_IMAGE, + base_image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps( - model_provider_parameters.__dict__, cls=utils.SetEncoder - ), + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -642,13 +684,30 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) + #init_container_spec = client.V1Container( + # name=STORAGE_INITIALIZER, + # image=STORAGE_INITIALIZER_IMAGE, + # args=[ + # "--model_provider", + # mp, + # "--model_provider_parameters", + # json.dumps( + # model_provider_parameters.__dict__, cls=utils.SetEncoder + # ), + # "--dataset_provider", + # dp, + # "--dataset_provider_parameters", + # json.dumps(dataset_provider_parameters.__dict__), + # ], + # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + #) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - # Create app container spec. - container_spec = client.V1Container( + + container_spec = get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=TRAINER_TRANSFORMER_IMAGE, + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -664,31 +723,88 @@ def tune( f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, + resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + env=env, + env_from=env_from, ) - - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} + # Create app container spec. 
+ #container_spec = client.V1Container( + # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + # image=TRAINER_TRANSFORMER_IMAGE, + # args=[ + # "--model_uri", + # model_provider_parameters.model_uri, + # "--transformer_type", + # model_provider_parameters.transformer_type.__name__, + # "--model_dir", + # VOLUME_PATH_MODEL, + # "--dataset_dir", + # VOLUME_PATH_DATASET, + # "--lora_config", + # f"'{lora_config}'", + # "--training_parameters", + # f"'{training_args}'", + # ], + # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + # env=env if env else None, + # env_from=env_from if env_from else None, + # resources=resources_per_trial, + #) + + storage_initializer_volume = models.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=name ), - spec=client.V1PodSpec( - restart_policy="Never", + ) + + if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): + pod_spec = get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], - volumes=[STORAGE_INITIALIZER_VOLUME], - ), - ) + volumes=[storage_initializer_volume], + restart_policy="Never", + ) + #pod_spec = client.V1PodTemplateSpec( + # metadata=models.V1ObjectMeta( + # annotations={"sidecar.istio.io/inject": "false"} + # ), + # spec=client.V1PodSpec( + # restart_policy="Never", + # containers=[container_spec], + # init_containers=[init_container_spec], + # volumes=[STORAGE_INITIALIZER_VOLUME], + # ), + #) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + else: + # create worker pod spec + worker_pod_spec = get_pod_template_spec( + containers=[container_spec], + volumes=[storage_initializer_volume], + ) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) + # create master pod spec + master_pod_spec = get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + ) + trial_spec = get_pytorchjob_template( + name=name, + namespace=namespace, + master_pod_template_spec=master_pod_spec, + worker_pod_template_spec=worker_pod_spec, + num_workers=resources_per_trial.num_workers, + num_procs_per_worker=resources_per_trial.num_procs_per_worker, + ) # Create Trial template. trial_template = models.V1beta1TrialTemplate( diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py new file mode 100644 index 00000000000..a38761478a0 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +# Import types into type package +from kubeflow.katib.types.trainer_resources import TrainerResources + +# Import Kubernetes models. 
+from kubernetes.client import * \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py new file mode 100644 index 00000000000..6ae7fa5741b --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -0,0 +1,139 @@ +import pprint +import re + +import six + +from kubeflow.katib.configuration import Configuration + + +class TrainerResources(object): + def __init__(self, num_workers=None, num_procs_per_worker=None, resources_per_worker=None, local_vars_configuration=None): + if local_vars_configuration is None: + local_vars_configuration = Configuration() + self.local_vars_configuration = local_vars_configuration + + self._num_workers = None + self._num_procs_per_worker = None + self._resources_per_worker = None + + if num_workers is not None: + self.num_workers = num_workers + if num_procs_per_worker is not None: + self.num_procs_per_worker = num_procs_per_worker + if resources_per_worker is not None: + self.resources_per_worker = resources_per_worker + + @property + def num_workers(self): + """Gets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :return: The number of workers of distributed training. + :rtype: int + """ + return self._num_workers + + @num_workers.setter + def num_workers(self, num_workers): + """Sets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :param num_workers: The number of workers of distributed training. + :type: int + """ + + self._num_workers = num_workers + + @property + def num_procs_per_worker(self): + """Gets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. + + :return: The number of processed per worker of distributed training. 
+ :rtype: int + """ + return self._num_procs_per_worker + + @num_procs_per_worker.setter + def num_procs_per_worker(self, num_procs_per_worker): + """Sets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. + + :param num_procs_per_worker: The number of processes per worker of distributed training. + :type: int + """ + + self._num_procs_per_worker = num_procs_per_worker + + @property + def resources_per_worker(self): + """Gets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :return: The resources per worker of distributed training. + :rtype: dict or V1ResourceRequirements + """ + return self._resources_per_worker + + @resources_per_worker.setter + def resources_per_worker(self, resources_per_worker): + """Sets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :param resources_per_worker: The resources per worker of distributed training. 
+ :type: dict or V1ResourceRequirements + """ + + self._resources_per_worker = resources_per_worker + + def to_dict(self): + """Returns the resources properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.__dict__): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TrainerResources): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TrainerResources): + return True + + return self.to_dict() != other.to_dict() diff --git a/sdk/python/v1beta1/test_llm.py b/sdk/python/v1beta1/test_llm.py new file mode 100644 index 00000000000..4bc81efb459 --- /dev/null +++ b/sdk/python/v1beta1/test_llm.py @@ -0,0 +1,63 @@ +import kubeflow.katib as katib +from kubeflow.katib import KatibClient + +import transformers +from peft import LoraConfig + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, +) + +cl = KatibClient(namespace="kubeflow") + + +# [3] Create Katib Experiment with 12 Trials and 2 CPUs per Trial. +name = "llm-experiment" +cl.tune( + name = name, + # BERT model URI and type of Transformer to train it. 
+ model_provider_parameters = HuggingFaceModelParams( + model_uri = "hf://google-bert/bert-base-cased", + transformer_type = transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters = HuggingFaceDatasetParams( + repo_id = "yelp_review_full", + split = "train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + output_dir = "test_tune_api", + save_strategy = "no", + learning_rate = katib.search.double(min=1e-05, max=5e-05), + #no_cuda=True, #if you use cpu instead of gpu + #use_cpu=True, #if you use cpu instead of gpu + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config = LoraConfig( + r = katib.search.int(min=8, max=32), + lora_alpha = 8, + lora_dropout = 0.1, + bias = "none", + ), + ), + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 1, + parallel_trial_count = 1, + resources_per_trial={ + "cpu": "4", + "memory": "10G", + }, +) + +# [4] Wait until Katib Experiment is complete +cl.wait_for_experiment_condition(name=name) + +# [5] Get the best hyperparameters. 
+#print(cl.get_optimal_hyperparameters(name)) \ No newline at end of file From 5a87eb01be311baea7d62f56e176b992d33250d2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 10:50:53 +0800 Subject: [PATCH 16/53] reuse 'get_container_spec' and 'get_pod_template_spec' from Training Operator Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 244 +++++------------- 1 file changed, 61 insertions(+), 183 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5a9b1ea5bff..910d6b03d40 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -24,7 +24,6 @@ import grpc from kubeflow.katib import models -from kubeflow.katib import types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -197,7 +196,7 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, types.TrainerResources, None] = None, + resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", @@ -294,11 +293,7 @@ def tune( """ print( - "Thank you for using the `tune` API for LLM hyperparameter optimization. " - "You can create a HyperParameter Optimization Katib Experiment using one of the following options:\n" - "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`.\n" - "2. Use custom objective function: specify `objective`, `base_image` and `parameters`.\n" - "This feature is in the alpha stage. The Kubeflow community is looking for your feedback. 
Please share your experience via the #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( @@ -306,7 +301,7 @@ def tune( or dataset_provider_parameters is not None or trainer_parameters is not None ) and ( - objective is not None or base_image is not None or parameters is not None + objective is not None or parameters is not None ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " @@ -358,14 +353,14 @@ def tune( experiment.spec.max_failed_trial_count = max_failed_trial_count # Add resources to the Katib Experiment. - #if isinstance(resources_per_trial, dict): - # if "gpu" in resources_per_trial: - # resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") + if isinstance(resources_per_trial, dict): + if "gpu" in resources_per_trial: + resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - # resources_per_trial = client.V1ResourceRequirements( - # requests=resources_per_trial, - # limits=resources_per_trial, - # ) + resources_per_trial = client.V1ResourceRequirements( + requests=resources_per_trial, + limits=resources_per_trial, + ) # Add environment variables to the Katib Experiment. env = [] @@ -414,11 +409,11 @@ def tune( utils.validate_objective_function(objective) # Extract objective function implementation. - #objective_code = inspect.getsource(objective) + objective_code = inspect.getsource(objective) # Objective function might be defined in some indented scope # (e.g. in another function). We need to dedent the function code. - #objective_code = textwrap.dedent(objective_code) + objective_code = textwrap.dedent(objective_code) # Iterate over input parameters. 
input_params = {} @@ -446,94 +441,48 @@ def tune( # def objective(parameters): # print(f'Parameters are {parameters}') # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - #objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" # Prepare execute script template. - #exec_script = textwrap.dedent( - # """ - # program_path=$(mktemp -d) - # read -r -d '' SCRIPT << EOM\n - # {objective_code} - # EOM - # printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - # python3 -u $program_path/ephemeral_objective.py""" - #) + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" + ) # Add objective code to the execute script. - #exec_script = exec_script.format(objective_code=objective_code) + exec_script = exec_script.format(objective_code=objective_code) # Install Python packages if that is required. 
- #if packages_to_install is not None: - # exec_script = ( - # utils.get_script_for_python_packages( - # packages_to_install, pip_index_url - # ) - # + exec_script - # ) - - from kubeflow.training.utils import get_container_spec, get_pod_template_spec, get_pytorchjob_template - # Create app container spec - container_spec = get_container_spec( + if packages_to_install is not None: + exec_script = ( + utils.get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, - train_func=objective, - train_func_parameters=input_params, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, - env=env, - env_from=env_from, + command=["bash", "-c"], + args=[exec_script], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, ) - #container_spec = client.V1Container( - # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - # image=base_image, - # command=["bash", "-c"], - # args=[exec_script], - # env=env if env else None, - # env_from=env_from if env_from else None, - # resources=resources_per_trial, - #) - - if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): - pod_spec = get_pod_template_spec( - containers = [container_spec], - restart_policy="Never", - ) - #pod_spec = client.V1PodTemplateSpec( - # metadata=models.V1ObjectMeta( - # annotations={"sidecar.istio.io/inject": "false"} - # ), - # spec=client.V1PodSpec( - # restart_policy="Never", - # containers=[container_spec], - # ), - #) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) - else: - worker_pod_spec = get_pod_template_spec( - containers = [container_spec], - restart_policy="Never", - ) - master_pod_spec = get_pod_template_spec( - containers = [container_spec], + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( restart_policy="Never", - ) - trial_spec = get_pytorchjob_template( - name=name, - namespace=namespace, - master_pod_template_spec=master_pod_spec, - worker_pod_template_spec=worker_pod_spec, - num_workers=resources_per_trial.num_workers, - num_procs_per_worker=resources_per_trial.num_procs_per_worker, - ) + containers=[container_spec], + ), + ) # If users choose to use external models and datasets. else: @@ -548,7 +497,6 @@ def tune( from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, - #STORAGE_INITIALIZER_VOLUME, STORAGE_INITIALIZER_IMAGE, TRAINER_TRANSFORMER_IMAGE, ) @@ -669,6 +617,8 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
+ from kubeflow.training.utils.utils import get_container_spec, get_pod_template_spec + init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, @@ -684,23 +634,6 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - #init_container_spec = client.V1Container( - # name=STORAGE_INITIALIZER, - # image=STORAGE_INITIALIZER_IMAGE, - # args=[ - # "--model_provider", - # mp, - # "--model_provider_parameters", - # json.dumps( - # model_provider_parameters.__dict__, cls=utils.SetEncoder - # ), - # "--dataset_provider", - # dp, - # "--dataset_provider_parameters", - # json.dumps(dataset_provider_parameters.__dict__), - # ], - # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - #) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) @@ -723,33 +656,10 @@ def tune( f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + resources=resources_per_trial, env=env, env_from=env_from, ) - # Create app container spec. 
- #container_spec = client.V1Container( - # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - # image=TRAINER_TRANSFORMER_IMAGE, - # args=[ - # "--model_uri", - # model_provider_parameters.model_uri, - # "--transformer_type", - # model_provider_parameters.transformer_type.__name__, - # "--model_dir", - # VOLUME_PATH_MODEL, - # "--dataset_dir", - # VOLUME_PATH_DATASET, - # "--lora_config", - # f"'{lora_config}'", - # "--training_parameters", - # f"'{training_args}'", - # ], - # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - # env=env if env else None, - # env_from=env_from if env_from else None, - # resources=resources_per_trial, - #) storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, @@ -758,53 +668,21 @@ def tune( ), ) - if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): - pod_spec = get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], - restart_policy="Never", - ) - #pod_spec = client.V1PodTemplateSpec( - # metadata=models.V1ObjectMeta( - # annotations={"sidecar.istio.io/inject": "false"} - # ), - # spec=client.V1PodSpec( - # restart_policy="Never", - # containers=[container_spec], - # init_containers=[init_container_spec], - # volumes=[STORAGE_INITIALIZER_VOLUME], - # ), - #) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) - else: - # create worker pod spec - worker_pod_spec = get_pod_template_spec( - containers=[container_spec], - volumes=[storage_initializer_volume], - ) + pod_spec = get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + restart_policy="Never", + ) - # create master pod spec - master_pod_spec = get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], - ) - trial_spec = get_pytorchjob_template( - name=name, - namespace=namespace, - master_pod_template_spec=master_pod_spec, - worker_pod_template_spec=worker_pod_spec, - num_workers=resources_per_trial.num_workers, - num_procs_per_worker=resources_per_trial.num_procs_per_worker, - ) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) # Create Trial template. 
trial_template = models.V1beta1TrialTemplate( @@ -1113,7 +991,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, + timeout: int = 6000, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): From 71605b469d1a5150ea3f260872844c6e34442cda Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:37:14 +0800 Subject: [PATCH 17/53] format with black Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 57624061ef3..e42778f2ad1 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -233,7 +233,7 @@ def tune( The function should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. - parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: + parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: ``` parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` ``` @@ -293,16 +293,16 @@ def tune( """ print( - "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. 
This feature is in the alpha stage. " + "Kubeflow community is looking for your feedback. Please share your experience via " + "#kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( model_provider_parameters is not None or dataset_provider_parameters is not None or trainer_parameters is not None - ) and ( - objective is not None or parameters is not None - ): + ) and (objective is not None or parameters is not None): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " "You should only specify one of the following options:\n" @@ -382,15 +382,17 @@ def tune( ) # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ # For example: train_loss=0.846 - r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - # For example: 'train_loss':0.846 + r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + # For example: 'train_loss':0.846 r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", ] ) @@ -458,10 +460,12 @@ def tune( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) + utils.get_script_for_python_packages( + packages_to_install, pip_index_url + ) + exec_script ) - + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, @@ -480,7 +484,7 @@ def tune( restart_policy="Never", containers=[container_spec], ), - ) + ) # If users choose to use external models and datasets. else: @@ -615,7 +619,10 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. - from kubeflow.training.utils.utils import get_container_spec, get_pod_template_spec + from kubeflow.training.utils.utils import ( + get_container_spec, + get_pod_template_spec, + ) init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, @@ -624,7 +631,9 @@ def tune( "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", From 35acedb95c2c6d5b1e5808c132157e151e025416 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:51:16 +0800 Subject: [PATCH 18/53] fix Lint error Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 43 ++++++------- .../v1beta1/kubeflow/katib/types/__init__.py | 2 +- .../kubeflow/katib/types/trainer_resources.py | 3 +- sdk/python/v1beta1/test_llm.py | 63 ------------------- 4 files changed, 22 insertions(+), 89 deletions(-) delete mode 100644 sdk/python/v1beta1/test_llm.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e42778f2ad1..e106f452706 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -496,26 +496,25 @@ def tune( raise ValueError("One of the required 
parameters is None") try: - from kubeflow.training.constants.constants import ( - STORAGE_INITIALIZER, - STORAGE_INITIALIZER_VOLUME_MOUNT, - STORAGE_INITIALIZER_IMAGE, - TRAINER_TRANSFORMER_IMAGE, - ) - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceTrainerParams, - ) + from kubeflow.storage_initializer.constants import \ + VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import \ + VOLUME_PATH_MODEL + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceTrainerParams from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER_IMAGE + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER_VOLUME_MOUNT + from kubeflow.training.constants.constants import \ + TRAINER_TRANSFORMER_IMAGE import peft import transformers except ImportError: @@ -619,10 +618,8 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
- from kubeflow.training.utils.utils import ( - get_container_spec, - get_pod_template_spec, - ) + from kubeflow.training.utils.utils import get_container_spec + from kubeflow.training.utils.utils import get_pod_template_spec init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index a38761478a0..a99fbea74b3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -4,4 +4,4 @@ from kubeflow.katib.types.trainer_resources import TrainerResources # Import Kubernetes models. -from kubernetes.client import * \ No newline at end of file +from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index 6ae7fa5741b..54968af2081 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -1,9 +1,8 @@ import pprint import re -import six - from kubeflow.katib.configuration import Configuration +import six class TrainerResources(object): diff --git a/sdk/python/v1beta1/test_llm.py b/sdk/python/v1beta1/test_llm.py deleted file mode 100644 index 4bc81efb459..00000000000 --- a/sdk/python/v1beta1/test_llm.py +++ /dev/null @@ -1,63 +0,0 @@ -import kubeflow.katib as katib -from kubeflow.katib import KatibClient - -import transformers -from peft import LoraConfig - -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - HuggingFaceTrainerParams, -) - -cl = KatibClient(namespace="kubeflow") - - -# [3] Create Katib Experiment with 12 Trials and 2 CPUs per Trial. -name = "llm-experiment" -cl.tune( - name = name, - # BERT model URI and type of Transformer to train it. 
- model_provider_parameters = HuggingFaceModelParams( - model_uri = "hf://google-bert/bert-base-cased", - transformer_type = transformers.AutoModelForSequenceClassification, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters = HuggingFaceDatasetParams( - repo_id = "yelp_review_full", - split = "train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters = HuggingFaceTrainerParams( - training_parameters = transformers.TrainingArguments( - output_dir = "test_tune_api", - save_strategy = "no", - learning_rate = katib.search.double(min=1e-05, max=5e-05), - #no_cuda=True, #if you use cpu instead of gpu - #use_cpu=True, #if you use cpu instead of gpu - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config = LoraConfig( - r = katib.search.int(min=8, max=32), - lora_alpha = 8, - lora_dropout = 0.1, - bias = "none", - ), - ), - objective_metric_name = "train_loss", - objective_type = "minimize", - algorithm_name = "random", - max_trial_count = 1, - parallel_trial_count = 1, - resources_per_trial={ - "cpu": "4", - "memory": "10G", - }, -) - -# [4] Wait until Katib Experiment is complete -cl.wait_for_experiment_condition(name=name) - -# [5] Get the best hyperparameters. 
-#print(cl.get_optimal_hyperparameters(name)) \ No newline at end of file From af534b36d12a4292a79a6a1b6cee7ca79c0fd171 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:53:21 +0800 Subject: [PATCH 19/53] fix Lint errors Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index a99fbea74b3..46661f0cebb 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -2,6 +2,5 @@ # Import types into type package from kubeflow.katib.types.trainer_resources import TrainerResources - # Import Kubernetes models. from kubernetes.client import * From c7f6e10125413332ebd9835f9901dd7345bb445b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:55:32 +0800 Subject: [PATCH 20/53] delete types Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/types/__init__.py | 6 - .../kubeflow/katib/types/trainer_resources.py | 138 ------------------ 2 files changed, 144 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py deleted file mode 100644 index 46661f0cebb..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import absolute_import - -# Import types into type package -from kubeflow.katib.types.trainer_resources import TrainerResources -# Import Kubernetes models. 
-from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py deleted file mode 100644 index 54968af2081..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ /dev/null @@ -1,138 +0,0 @@ -import pprint -import re - -from kubeflow.katib.configuration import Configuration -import six - - -class TrainerResources(object): - def __init__(self, num_workers=None, num_procs_per_worker=None, resources_per_worker=None, local_vars_configuration=None): - if local_vars_configuration is None: - local_vars_configuration = Configuration() - self.local_vars_configuration = local_vars_configuration - - self._num_workers = None - self._num_procs_per_worker = None - self._resources_per_worker = None - - if num_workers is not None: - self.num_workers = num_workers - if num_procs_per_worker is not None: - self.num_procs_per_worker = num_procs_per_worker - if resources_per_worker is not None: - self.resources_per_worker = resources_per_worker - - @property - def num_workers(self): - """Gets the number of workers of distributed training. - - Number of workers is setting number of workers. - - :return: The number of workers of distributed training. - :rtype: int - """ - return self._num_workers - - @num_workers.setter - def num_workers(self, num_workers): - """Sets the number of workers of distributed training. - - Number of workers is setting number of workers. - - :param num_workers: The number of workers of distributed training. - :type: int - """ - - self._num_workers = num_workers - - @property - def num_procs_per_worker(self): - """Gets the number of processes per worker of distributed training. - - Number of processes per worker is the setting number of processes per worker. - - :return: The number of processed per worker of distributed training. 
- :rtype: int - """ - return self._num_procs_per_worker - - @num_procs_per_worker.setter - def num_procs_per_worker(self, num_procs_per_worker): - """Sets the number of processes per worker of distributed training. - - Number of processes per worker is the setting number of processes per worker. - - :param num_procs_per_worker: The number of processes per worker of distributed training. - :type: int - """ - - self._num_procs_per_worker = num_procs_per_worker - - @property - def resources_per_worker(self): - """Gets the resources per worker of distributed training. - - Resources per worker is the setting resources per worker. - - :return: The resources per worker of distributed training. - :rtype: dict or V1ResourceRequirements - """ - return self._resources_per_worker - - @resources_per_worker.setter - def resources_per_worker(self, resources_per_worker): - """Sets the resources per worker of distributed training. - - Resources per worker is the setting resources per worker. - - :param resources_per_worker: The resources per worker of distributed training. 
- :type: dict or V1ResourceRequirements - """ - - self._resources_per_worker = resources_per_worker - - def to_dict(self): - """Returns the resources properties as a dict""" - result = {} - - for attr, _ in six.iteritems(self.__dict__): - value = getattr(self, attr) - if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) - elif hasattr(value, "to_dict"): - result[attr] = value.to_dict() - elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) - else: - result[attr] = value - - return result - - def to_str(self): - """Returns the string representation of the model""" - return pprint.pformat(self.to_dict()) - - def __repr__(self): - """For `print` and `pprint`""" - return self.to_str() - - def __eq__(self, other): - """Returns true if both objects are equal""" - if not isinstance(other, TrainerResources): - return False - - return self.to_dict() == other.to_dict() - - def __ne__(self, other): - """Returns true if both objects are not equal""" - if not isinstance(other, TrainerResources): - return True - - return self.to_dict() != other.to_dict() From 9fdbdb72a99f8c436601bb900052a6bdc76a63c0 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 12:01:07 +0800 Subject: [PATCH 21/53] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e106f452706..4bb48478ba8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -995,7 +995,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - 
timeout: int = 6000, + timeout: int = 600, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): @@ -1035,9 +1035,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Raise exception if Experiment is Failed. @@ -1057,9 +1055,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Running condition. @@ -1070,9 +1066,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Restarting condition. @@ -1083,9 +1077,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Succeeded condition. @@ -1096,9 +1088,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Otherwise, print the current Experiment results and sleep for the pooling interval. 
From ddd515319bd747ea7fad4e8251c132471ab2169e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 16:55:06 +0800 Subject: [PATCH 22/53] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4bb48478ba8..3a1dfc8a9d4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -200,7 +200,7 @@ def tune( retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: 1. External models and datasets From b31e820a825ca1fa06eae7488c04b451df430a05 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 16:56:47 +0800 Subject: [PATCH 23/53] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 3a1dfc8a9d4..4bb48478ba8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -200,7 +200,7 @@ def tune( retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: 1. 
External models and datasets From dad3831be2ec36683f3c980f2d747cfc1481d380 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 20:03:26 +0800 Subject: [PATCH 24/53] fix e2e test error Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4bb48478ba8..453485623cb 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -384,9 +384,7 @@ def tune( # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ From 1afe56def822aee6145efe9fc271900fd8906a87 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 08:27:35 +0800 Subject: [PATCH 25/53] add TODO Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 453485623cb..6177efa093c 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -659,8 +659,7 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_trial, - env=env, - env_from=env_from, + # TODO (helenxie-bit): Add `env` and `env_from` in the future ) storage_initializer_volume = models.V1Volume( From 
ad7bce8c61b2463aab6810b4a0aa5c8abdb11743 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 08:57:00 +0800 Subject: [PATCH 26/53] format with max line length Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6177efa093c..14920253b3f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -382,7 +382,8 @@ def tune( ) # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is + # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( From 7e58c9470e1dd0ed963683cde357e9f00ac855c6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 09:59:53 +0800 Subject: [PATCH 27/53] format docstring Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 138 +++++++++++------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 14920253b3f..cffc1cdac20 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -202,21 +202,45 @@ def tune( pip_index_url: str = "https://pypi.org/simple", metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): - """Create HyperParameter Tuning Katib Experiment using one of the following options: + """ + Create HyperParameter Tuning Katib 
Experiment using one of the following + options: + 1. External models and datasets - Parameters: `model_provider_parameters` + `dataset_provider_parameters` + `trainer_parameters`. - Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + Parameters: `model_provider_parameters` + `dataset_provider_parameters` + + `trainer_parameters`. + Usage: Specify both `model_provider_parameters` and + `dataset_provider_parameters` to download models and datasets from external + platforms (currently supports HuggingFace and Amazon S3) using the Storage + Initializer. The `trainer_parameters` should be of type + `HuggingFaceTrainerParams` to set the hyperparameters search space. This API + will automatically define the "Trainer" class in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + for optimizing hyperparameters. + 2. Custom objective function Parameters: `objective` + `base_image` + `parameters`. - Usage: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. The `parameters` should be a dictionary to define the search space for these parameters. + Usage: Specify the `objective` parameter to define your own objective function. + The `base_image` parameter will be used to execute the objective function. The + `parameters` should be a dictionary to define the search space for these + parameters. Args: name: Name for the Experiment. 
- model_provider_parameters: Parameters for the model provider in the Storage Initializer. - For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. - dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. - For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` - trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. You should use the Katib SDK to define the search space for these parameters.For example: + model_provider_parameters: Parameters for the model provider in the Storage + Initializer. + For example, HuggingFace model name and Transformer type for that model, + like: AutoModelForSequenceClassification. This argument must be the type + of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. + dataset_provider_parameters: Parameters for the dataset provider in the + Storage Initializer. + For example, name of the HuggingFace dataset or AWS S3 configuration. + This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` + or `kubeflow.storage_initializer.s3.S3DatasetParams` + trainer_parameters: Parameters for configuring the training process, + including settings for the hyperparameters search space. It should be of + type `HuggingFaceTrainerParams`. You should use the Katib SDK to define + the search space for these parameters. 
For example: ``` trainer_parameters = HuggingFaceTrainerParams( training_parameters = transformers.TrainingArguments( @@ -224,19 +248,22 @@ def tune( ), ), ``` - Also, you can use these parameters to define input for training the models. - storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. - You can configure PVC size and storage class name in this argument. - objective: Objective function that Katib uses to train the model. - This function must be Callable and it must have only one dict argument. - Katib uses this argument to send HyperParameters to the function. - The function should not use any code declared outside of the function - definition. Import statements must be added inside the function. + Also, you can use these parameters to define input for training the + models. + storage_config: Configuration for Storage Initializer PVC to download + pre-trained model and dataset. You can configure PVC size and storage + class name in this argument. + objective: Objective function that Katib uses to train the model. This + function must be Callable and it must have only one dict argument. Katib + uses this argument to send HyperParameters to the function. The function + should not use any code declared outside of the function definition. + Import statements must be added inside the function. base_image: Image to use when executing the objective function. - parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: - ``` - parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` - ``` + parameters: Dict of HyperParameters to tune your Experiment if you choose a custom + objective function. You should use Katib SDK to define the search space for these + parameters. 
For example: + `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` + Also, you can use these parameters to define input for your objective function. namespace: Namespace for the Experiment. env_per_trial: Environment variable(s) to be attached to each trial container. @@ -259,24 +286,24 @@ def tune( values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. - resources_per_trial: A parameter that lets you specify how much - resources each trial container should have. You can either specify a - kubernetes.client.V1ResourceRequirements object (documented here: - https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) - or a dictionary that includes one or more of the following keys: - `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate - values for these keys are documented here: - https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. - For example: + resources_per_trial: A parameter that lets you specify how much resources + each trial container should have. You can either specify a + kubernetes.client.V1ResourceRequirements object (documented here: + https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) + or a dictionary that includes one or more of the following keys: `cpu`, + `memory`, or `gpu` (other keys will be ignored). Appropriate values + for these keys are documented here: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. + For example: { "cpu": "1", "gpu": "1", "memory": "2Gi", } - Please note, `gpu` specifies a resource request with a key of - `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type - of GPU, pass in a V1ResourceRequirement instance instead, since it's - more flexible. 
This parameter is optional and defaults to None. + Please note, `gpu` specifies a resource request with a key of + `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of + GPU, pass in a V1ResourceRequirement instance instead, since it's more + flexible. This parameter is optional and defaults to None. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition to the base image packages. These packages are installed before @@ -293,9 +320,10 @@ def tune( """ print( - "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. " - "Kubeflow community is looking for your feedback. Please share your experience via " - "#kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. This feature " + "is in the alpha stage. Kubeflow community is looking for your feedback. Please " + "share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib " + "GitHub." ) if ( @@ -304,10 +332,12 @@ def tune( or trainer_parameters is not None ) and (objective is not None or parameters is not None): raise ValueError( - "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " - "You should only specify one of the following options:\n" - "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`;\n" - "2. Use custom objective function: specify `objective`, `base_image` and `parameters`." + "Invalid configuration for creating a Katib Experiment for hyperparameter " + "optimization. You should only specify one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, " + "`dataset_provider_parameters` and `trainer_parameters`;\n" + "2. 
Use custom objective function: specify `objective`, `base_image` and " + "`parameters`." ) if not name: @@ -801,8 +831,8 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when - `status` is True for the appropriate condition `type`. + """Get the Experiment conditions. Experiment is in the condition when `status` + is True for the appropriate condition `type`. Args: name: Name for the Experiment. @@ -997,8 +1027,8 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits - for the Succeeded condition. + """Wait until Experiment reaches specific condition. By default it waits for the + Succeeded condition. Args: name: Name for the Experiment. @@ -1109,9 +1139,9 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. You can modify Trial - budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` - resume policies. + """Update Experiment budget for the running Trials. You can modify Trial budget + to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume + policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1350,8 +1380,8 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, - it returns all Trials belong to the Experiment. + """List of all Trials in namespace. If Experiment name is set, it returns all + Trials belong to the Experiment. Args: experiment_name: Optional name for the Experiment. 
@@ -1410,8 +1440,8 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, - it returns Succeeded Trials details belong to the Experiment. + """Get the Succeeded Trial details. If Experiment name is set, it returns + Succeeded Trials details belong to the Experiment. Args: experiment_name: Optional name for the Experiment. @@ -1519,8 +1549,8 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. - Katib DB Manager service should be accessible while calling this API. + """Get the Trial Metric Results from the Katib DB. Katib DB Manager service + should be accessible while calling this API. If you run this API in-cluster (e.g. from the Kubeflow Notebook) you can use the default Katib DB Manager address: `katib-db-manager.kubeflow:6789`. From 61dc8ca1d9e8bec88c3ebc210c0e9b6b587f563a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 15:25:55 +0800 Subject: [PATCH 28/53] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index cffc1cdac20..27307e305c8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -214,8 +214,8 @@ def tune( platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API - will automatically define the "Trainer" class in HuggingFace with the provided - parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + will automatically define the "Trainer" in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. 2. Custom objective function From ba0d7d173dd943236c8865c9047fc215c7b3e2f9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:23:34 +0800 Subject: [PATCH 29/53] add helper functions Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 106 ++++--------- .../v1beta1/kubeflow/katib/utils/utils.py | 142 +++++++++++++++++- 2 files changed, 168 insertions(+), 80 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 27307e305c8..1eebd3fe47f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -214,8 +214,8 @@ def tune( platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API - will automatically define the "Trainer" in HuggingFace with the provided - parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + will automatically define the "Trainer" in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. 2. Custom objective function @@ -259,9 +259,9 @@ class name in this argument. should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. 
- parameters: Dict of HyperParameters to tune your Experiment if you choose a custom - objective function. You should use Katib SDK to define the search space for these - parameters. For example: + parameters: Dict of HyperParameters to tune your Experiment if you choose a custom + objective function. You should use Katib SDK to define the search space for these + parameters. For example: `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` Also, you can use these parameters to define input for your objective function. @@ -286,12 +286,12 @@ class name in this argument. values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. - resources_per_trial: A parameter that lets you specify how much resources + resources_per_trial: A parameter that lets you specify how much resources each trial container should have. You can either specify a kubernetes.client.V1ResourceRequirements object (documented here: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) - or a dictionary that includes one or more of the following keys: `cpu`, - `memory`, or `gpu` (other keys will be ignored). Appropriate values + or a dictionary that includes one or more of the following keys: `cpu`, + `memory`, or `gpu` (other keys will be ignored). Appropriate values for these keys are documented here: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. For example: @@ -301,8 +301,8 @@ class name in this argument. "memory": "2Gi", } Please note, `gpu` specifies a resource request with a key of - `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of - GPU, pass in a V1ResourceRequirement instance instead, since it's more + `nvidia.com/gpu`, i.e. an NVIDIA GPU. 
If you need a different type of + GPU, pass in a V1ResourceRequirement instance instead, since it's more flexible. This parameter is optional and defaults to None. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition @@ -382,16 +382,6 @@ class name in this argument. if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - # Add resources to the Katib Experiment. - if isinstance(resources_per_trial, dict): - if "gpu" in resources_per_trial: - resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - - resources_per_trial = client.V1ResourceRequirements( - requests=resources_per_trial, - limits=resources_per_trial, - ) - # Add environment variables to the Katib Experiment. env = [] env_from = [] @@ -415,7 +405,9 @@ class name in this argument. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -437,13 +429,6 @@ class name in this argument. # Validate objective function. utils.validate_objective_function(objective) - # Extract objective function implementation. - objective_code = inspect.getsource(objective) - - # Objective function might be defined in some indented scope - # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) - # Iterate over input parameters. input_params = {} experiment_params = [] @@ -466,53 +451,21 @@ class name in this argument. # Otherwise, add value to the function input. 
input_params[p_name] = p_value - # Wrap objective function to execute it from the file. For example - # def objective(parameters): - # print(f'Parameters are {parameters}') - # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) - - # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages( - packages_to_install, pip_index_url - ) - + exec_script - ) - - container_spec = client.V1Container( + container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, + base_image=base_image, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, resources=resources_per_trial, + env=env, + env_from=env_from, ) - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[container_spec], - ), + pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + restart_policy="Never", ) # If users choose to use external models and datasets. @@ -646,11 +599,7 @@ class name in this argument. value = type(old_attr)(p_value) setattr(lora_config, p_name, value) - # Create init container spec. 
- from kubeflow.training.utils.utils import get_container_spec - from kubeflow.training.utils.utils import get_pod_template_spec - - init_container_spec = get_container_spec( + init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, args=[ @@ -671,7 +620,7 @@ class name in this argument. lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - container_spec = get_container_spec( + container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, base_image=TRAINER_TRANSFORMER_IMAGE, args=[ @@ -690,7 +639,6 @@ class name in this argument. ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_trial, - # TODO (helenxie-bit): Add `env` and `env_from` in the future ) storage_initializer_volume = models.V1Volume( @@ -700,7 +648,7 @@ class name in this argument. ), ) - pod_spec = get_pod_template_spec( + pod_spec = utils.get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], volumes=[storage_initializer_volume], diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 8c90a001d96..d3e8bc6c0e6 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -16,7 +16,7 @@ import json import os import textwrap -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from kubeflow.katib import models from kubeflow.katib.constants import constants @@ -131,6 +131,146 @@ def __init__(self, obj): self.data = json.dumps(obj) +def get_command_using_train_func( + train_func: Optional[Callable], + train_func_parameters: Optional[Dict[str, Any]] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = "https://pypi.org/simple", +) -> Tuple[List[str], List[str]]: + """ 
+ Get container args and command from the given training function and parameters. + """ + # Check if function is callable. + if not callable(train_func): + raise ValueError( + f"Training function must be callable, got function type: {type(train_func)}" + ) + + # Extract function implementation. + func_code = inspect.getsource(train_func) + + # Function might be defined in some indented scope (e.g. in another function). + # We need to dedent the function code. + func_code = textwrap.dedent(func_code) + + # Wrap function code to execute it from the file. For example: + # def train(parameters): + # print('Start Training...') + # train({'lr': 0.01}) + if train_func_parameters is None: + func_code = f"{func_code}\n{train_func.__name__}()\n" + else: + func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {func_code} + EOM + printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\" + python3 -u \"$program_path/ephemeral_script.py\"""" + ) + + # Add function code to the execute script. + exec_script = exec_script.format(func_code=func_code) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + # Return container command and args to execute training function. 
+ return ["bash", "-c"], [exec_script] + + +def get_container_spec( + name: str, + base_image: str, + train_func: Optional[Callable] = None, + train_func_parameters: Optional[Dict[str, Any]] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = "https://pypi.org/simple", + args: Optional[List[str]] = None, + resources: Union[dict, models.V1ResourceRequirements, None] = None, + volume_mounts: Optional[List[models.V1VolumeMount]] = None, + env: Optional[List[models.V1EnvVar]] = None, + env_from: Optional[List[models.V1EnvFromSource]] = None, +) -> models.V1Container: + """ + Get container spec for the given parameters. + """ + + if name is None or base_image is None: + raise ValueError("Container name or base image cannot be none") + + # Create initial container spec. + container_spec = models.V1Container( + name=name, image=base_image, args=args, volume_mounts=volume_mounts + ) + + # If training function is set, override container command and args to execute the function. + if train_func is not None: + container_spec.command, container_spec.args = get_command_using_train_func( + train_func=train_func, + train_func_parameters=train_func_parameters, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + ) + + # Convert dict to the Kubernetes container resources if that is required. + if isinstance(resources, dict): + # Convert all keys in resources to lowercase. + resources = {k.lower(): v for k, v in resources.items()} + if "gpu" in resources: + resources["nvidia.com/gpu"] = resources.pop("gpu") + + resources = models.V1ResourceRequirements( + requests=resources, + limits=resources, + ) + + # Add resources to the container spec. + container_spec.resources = resources + + # Add environment variables to the container spec. 
+ if env: + container_spec.env = env + if env_from: + container_spec.env_from = env_from + + + return container_spec + + +def get_pod_template_spec( + containers: List[models.V1Container], + init_containers: Optional[List[models.V1Container]] = None, + volumes: Optional[List[models.V1Volume]] = None, + restart_policy: Optional[str] = None, +) -> models.V1PodTemplateSpec: + """ + Get Pod template spec for the given parameters. + """ + + # Create Pod template spec. If the value is None, Pod doesn't have that parameter + pod_template_spec = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + init_containers=init_containers, + containers=containers, + volumes=volumes, + restart_policy=restart_policy, + ), + ) + + return pod_template_spec + def get_pvc_spec( pvc_name: str, namespace: str, From 2a1b0088b0f866f4371f1cea9383795f6ddeb6b6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:37:58 +0800 Subject: [PATCH 30/53] update format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 1eebd3fe47f..36de65cf7d9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -779,8 +779,8 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when `status` - is True for the appropriate condition `type`. + """Get the Experiment conditions. Experiment is in the condition when + `status` is True for the appropriate condition `type`. Args: name: Name for the Experiment. 
@@ -975,8 +975,8 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits for the - Succeeded condition. + """Wait until Experiment reaches specific condition. By default it waits + for the Succeeded condition. Args: name: Name for the Experiment. @@ -1087,9 +1087,9 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. You can modify Trial budget - to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume - policies. + """Update Experiment budget for the running Trials. You can modify Trial + budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` + resume policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1328,8 +1328,8 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, it returns all - Trials belong to the Experiment. + """List of all Trials in namespace. If Experiment name is set, + it returns all Trials belong to the Experiment. Args: experiment_name: Optional name for the Experiment. @@ -1388,8 +1388,8 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, it returns - Succeeded Trials details belong to the Experiment. + """Get the Succeeded Trial details. If Experiment name is set, + it returns Succeeded Trials details belong to the Experiment. Args: experiment_name: Optional name for the Experiment. 
@@ -1497,8 +1497,8 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. Katib DB Manager service - should be accessible while calling this API. + """Get the Trial Metric Results from the Katib DB. + Katib DB Manager service should be accessible while calling this API. If you run this API in-cluster (e.g. from the Kubeflow Notebook) you can use the default Katib DB Manager address: `katib-db-manager.kubeflow:6789`. From b3685214d76071314ce6594950b9d8a819c9b0e9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:56:33 +0800 Subject: [PATCH 31/53] update format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 36de65cf7d9..a8282717fe3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -779,7 +779,7 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when + """Get the Experiment conditions. Experiment is in the condition when `status` is True for the appropriate condition `type`. Args: @@ -975,7 +975,7 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits + """Wait until Experiment reaches specific condition. By default it waits for the Succeeded condition. Args: @@ -1087,8 +1087,8 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. 
You can modify Trial - budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` + """Update Experiment budget for the running Trials. You can modify Trial + budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1328,7 +1328,7 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, + """List of all Trials in namespace. If Experiment name is set, it returns all Trials belong to the Experiment. Args: @@ -1388,7 +1388,7 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, + """Get the Succeeded Trial details. If Experiment name is set, it returns Succeeded Trials details belong to the Experiment. Args: @@ -1497,7 +1497,7 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. + """Get the Trial Metric Results from the Katib DB. Katib DB Manager service should be accessible while calling this API. If you run this API in-cluster (e.g. 
from the Kubeflow Notebook) you can From 3ccbdf90af79d905cb22a79701759e0206d9251b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 15:00:36 +0800 Subject: [PATCH 32/53] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a8282717fe3..d00df9c26d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently supports HuggingFace and Amazon S3) using the Storage + platforms (currently support HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" in HuggingFace with the provided From 64e34e092e983f989c5ec006c0489b39f096a221 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 18:32:03 +0800 Subject: [PATCH 33/53] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d00df9c26d5..a8282717fe3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. 
Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently support HuggingFace and Amazon S3) using the Storage + platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" in HuggingFace with the provided From dde724c6f111e37a574acbbc9b3611732ecbe9e9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 18:35:43 +0800 Subject: [PATCH 34/53] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a8282717fe3..d00df9c26d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently supports HuggingFace and Amazon S3) using the Storage + platforms (currently support HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API will automatically define the "Trainer" in HuggingFace with the provided From 1cccd4a54330a16b1837cdc87105b403041b1b18 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 14 Aug 2024 20:01:09 +0800 Subject: [PATCH 35/53] fix dict substitution in training_parameters Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d00df9c26d5..dce95878400 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import inspect import json import logging @@ -573,7 +574,11 @@ class name in this argument. elif p_value is not None: old_attr = getattr(training_args, p_name, None) if old_attr is not None: - value = type(old_attr)(p_value) + if isinstance(p_value, dict): + # Update the existing dictionary without nesting + value = copy.deepcopy(p_value) + else: + value = type(old_attr)(p_value) setattr(training_args, p_name, value) lora_config = trainer_parameters.lora_config From 510661d50c1d78f3e90775e71a511f6d97f4319e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 07:56:04 +0800 Subject: [PATCH 36/53] fix typo Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index d3e8bc6c0e6..91aabec6750 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -271,6 +271,7 @@ def get_pod_template_spec( return pod_template_spec + def get_pvc_spec( pvc_name: str, namespace: str, @@ -288,7 
+289,7 @@ def get_pvc_spec( pvc_spec = models.V1PersistentVolumeClaim( api_version="v1", kind="PersistentVolumeClaim", - metadata={"name": pvc_name, "namepsace": namespace}, + metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], resources=models.V1ResourceRequirements( From f6b15a2b3d44461c79d155c1f09f76d6ed2c65c5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 10:53:26 +0800 Subject: [PATCH 37/53] resolve conflicts and add check for case of no parameters Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index b48950a3d60..c9c4ad370d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -333,10 +333,20 @@ class name in this argument. ) if ( - model_provider_parameters is not None - or dataset_provider_parameters is not None - or trainer_parameters is not None - ) and (objective is not None or parameters is not None): + ( + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) + and (objective is not None or parameters is not None) + ) or ( + ( + model_provider_parameters is None + and dataset_provider_parameters is None + and trainer_parameters is None + ) + and (objective is None and parameters is None) + ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " "optimization. 
You should only specify one of the following options:\n" @@ -1467,9 +1477,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = ( - trial.spec.parameter_assignments - ) + output[ + "parameter_assignments" + ] = trial.spec.parameter_assignments output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: From 6a3e046169019f0742f15a58a53f8c184a6a42d4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:01:04 +0800 Subject: [PATCH 38/53] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 35 +++++++++---------- .../v1beta1/kubeflow/katib/utils/utils.py | 4 +-- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index c9c4ad370d5..86688136d6e 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -349,7 +349,7 @@ class name in this argument. ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " - "optimization. You should only specify one of the following options:\n" + "optimization. You should specify one of the following options:\n" "1. Use external models and datasets: specify `model_provider_parameters`, " "`dataset_provider_parameters` and `trainer_parameters`;\n" "2. Use custom objective function: specify `objective`, `base_image` and " @@ -494,25 +494,22 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import \ - VOLUME_PATH_DATASET - from kubeflow.storage_initializer.constants import \ - VOLUME_PATH_MODEL - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceDatasetParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceModelParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceTrainerParams + from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceTrainerParams, + ) from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER_IMAGE - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER_VOLUME_MOUNT - from kubeflow.training.constants.constants import \ - TRAINER_TRANSFORMER_IMAGE + from kubeflow.training.constants.constants import STORAGE_INITIALIZER + from kubeflow.training.constants.constants import STORAGE_INITIALIZER_IMAGE + from kubeflow.training.constants.constants import STORAGE_INITIALIZER_VOLUME_MOUNT + from kubeflow.training.constants.constants import TRAINER_TRANSFORMER_IMAGE import peft import transformers except ImportError: diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 91aabec6750..77b10944f67 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -258,9 +258,7 @@ def get_pod_template_spec( # Create Pod template spec. 
If the value is None, Pod doesn't have that parameter pod_template_spec = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), + metadata=models.V1ObjectMeta(annotations={"sidecar.istio.io/inject": "false"}), spec=models.V1PodSpec( init_containers=init_containers, containers=containers, From 25541b92a6d20018d97f5c137d0741228c3402e3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:07:16 +0800 Subject: [PATCH 39/53] fix format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 86688136d6e..3fa4de8e3af 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -507,9 +507,15 @@ class name in this argument. ) from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.training.constants.constants import STORAGE_INITIALIZER - from kubeflow.training.constants.constants import STORAGE_INITIALIZER_IMAGE - from kubeflow.training.constants.constants import STORAGE_INITIALIZER_VOLUME_MOUNT - from kubeflow.training.constants.constants import TRAINER_TRANSFORMER_IMAGE + from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER_IMAGE, + ) + from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER_VOLUME_MOUNT, + ) + from kubeflow.training.constants.constants import ( + TRAINER_TRANSFORMER_IMAGE, + ) import peft import transformers except ImportError: @@ -1474,9 +1480,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output[ - "parameter_assignments" - ] = trial.spec.parameter_assignments + output["parameter_assignments"] = ( + trial.spec.parameter_assignments + ) output["metrics"] = trial.status.observation.metrics 
result.append(output) except multiprocessing.TimeoutError: From 99e74d19a96c0205e8ac496861cc38b57f1fd22e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:10:47 +0800 Subject: [PATCH 40/53] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 77b10944f67..61c6b864f45 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -242,7 +242,6 @@ def get_container_spec( if env_from: container_spec.env_from = env_from - return container_spec From 96cf99c8733c549667c747345ea40fb20b1c5242 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:02:46 +0800 Subject: [PATCH 41/53] fix flake8 error Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 170 +++++------------- .../kubeflow/katib/constants/constants.py | 1 - .../v1beta1/kubeflow/katib/utils/utils.py | 8 +- 3 files changed, 47 insertions(+), 132 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 3fa4de8e3af..8591a01257d 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -13,24 +13,22 @@ # limitations under the License. 
import copy -import inspect import json import logging import multiprocessing -import textwrap import time -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, List, Optional, Union logger = logging.getLogger(__name__) -import grpc +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants -import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.utils import utils -from kubernetes import client -from kubernetes import config +from kubernetes import client, config + +import grpc logger = logging.getLogger(__name__) @@ -136,18 +134,14 @@ def create_experiment( "name" ] # if "generate_name" is used, "name" gets a prefix from server except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to create Katib Experiment: {namespace}/{experiment_name}" - ) + raise TimeoutError(f"Timeout to create Katib Experiment: {namespace}/{experiment_name}") except Exception as e: if hasattr(e, "status") and e.status == 409: raise Exception( f"A Katib Experiment with the name " f"{namespace}/{experiment_name} already exists." ) - raise RuntimeError( - f"Failed to create Katib Experiment: {namespace}/{experiment_name}" - ) + raise RuntimeError(f"Failed to create Katib Experiment: {namespace}/{experiment_name}") logger.debug(f"Experiment {namespace}/{experiment_name} has been created") @@ -189,9 +183,7 @@ def tune( Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, algorithm_name: str = "random", - algorithm_settings: Union[ - dict, List[models.V1beta1AlgorithmSetting], None - ] = None, + algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None, objective_metric_name: str = None, additional_metric_names: List[str] = [], objective_type: str = "maximize", @@ -402,10 +394,7 @@ class name in this argument. 
env = [] env_from = [] if isinstance(env_per_trial, dict): - env = [ - client.V1EnvVar(name=str(k), value=str(v)) - for k, v in env_per_trial.items() - ] + env = [client.V1EnvVar(name=str(k), value=str(v)) for k, v in env_per_trial.items()] elif env_per_trial: for x in env_per_trial: if isinstance(x, client.V1EnvVar): @@ -413,17 +402,13 @@ class name in this argument. elif isinstance(x, client.V1EnvFromSource): env_from.append(x) else: - raise ValueError( - f"Incorrect value for env_per_trial: {env_per_trial}" - ) + raise ValueError(f"Incorrect value for env_per_trial: {env_per_trial}") # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -494,30 +479,21 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET - from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, ) from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, HuggingFaceModelParams, ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceTrainerParams, - ) from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.training.constants.constants import STORAGE_INITIALIZER from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, - ) - from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER_VOLUME_MOUNT, - ) - from kubeflow.training.constants.constants import ( TRAINER_TRANSFORMER_IMAGE, ) - import peft - import transformers except ImportError: raise ImportError( "Tune API dependencies not installed. " @@ -536,15 +512,11 @@ class name in this argument. ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim( - namespace - ) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == name: - print( - f"PVC '{name}' already exists in namespace " f"{namespace}." - ) + print(f"PVC '{name}' already exists in namespace " f"{namespace}.") break else: raise RuntimeError(f"failed to create PVC. Error: {e}") @@ -624,14 +596,12 @@ class name in this argument. 
init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, - base_image=STORAGE_INITIALIZER_IMAGE, + base_image="docker.io/helenxiehz428/test", #STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps( - model_provider_parameters.__dict__, cls=utils.SetEncoder - ), + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -645,7 +615,7 @@ class name in this argument. container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image=TRAINER_TRANSFORMER_IMAGE, + base_image="docker.io/helenxiehz428/test_llm4", #TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -666,9 +636,7 @@ class name in this argument. storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=name - ), + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource(claim_name=name), ) pod_spec = utils.get_pod_template_spec( @@ -780,19 +748,13 @@ def list_experiments( ) response = thread.get(timeout) result = [ - self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Experiment - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Experiment) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Experiments in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Experiments in namespace: {namespace}") except Exception: - raise RuntimeError( - f"Failed to list Katib Experiments in namespace: {namespace}" - ) + raise RuntimeError(f"Failed to list Katib Experiments in namespace: {namespace}") return result def get_experiment_conditions( @@ -1029,20 +991,14 @@ def wait_for_experiment_condition( # Wait for Failed condition. 
if ( expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_failed(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Raise exception if Experiment is Failed. - elif self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ): + elif self.is_experiment_failed(name, namespace, experiment, apiserver_timeout): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " f"Experiment conditions: {experiment.status.conditions}" @@ -1051,48 +1007,34 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_created(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Running condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_running(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Restarting condition. 
elif ( expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_restarting(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Succeeded condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_succeeded(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) @@ -1219,9 +1161,7 @@ def delete_experiment( body=delete_options, ) except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to delete Katib Experiment: {namespace}/{name}" - ) + raise TimeoutError(f"Timeout to delete Katib Experiment: {namespace}/{name}") except Exception: raise RuntimeError(f"Failed to delete Katib Experiment: {namespace}/{name}") @@ -1303,19 +1243,13 @@ def list_suggestions( ) response = thread.get(timeout) result = [ - self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Suggestion - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Suggestion) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Suggestions in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Suggestions in namespace: {namespace}") except Exception: - raise RuntimeError( - f"Failed to list Katib Suggestions in namespace: {namespace}" - ) + raise RuntimeError(f"Failed to list Katib Suggestions in namespace: {namespace}") return result def get_trial( @@ -1407,15 +1341,11 @@ def list_trials( ) response = thread.get(timeout) result = [ - 
self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Trial - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Trials in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result @@ -1467,28 +1397,18 @@ def get_success_trial_details( ) response = thread.get(timeout) for item in response.get("items"): - trial = self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Trial - ) - if ( - trial.status - and trial.status.conditions - and len(trial.status.conditions) > 0 - ): + trial = self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) + if trial.status and trial.status.conditions and len(trial.status.conditions) > 0: if utils.has_condition( trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = ( - trial.spec.parameter_assignments - ) + output["parameter_assignments"] = trial.spec.parameter_assignments output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Trials in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index fa4e5882727..8e2620bc168 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os -from kubernetes import 
client # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 61c6b864f45..07601405de4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -85,7 +85,6 @@ def validate_metrics_value(value: Any): def validate_objective_function(objective: Callable): - # Check if objective function is callable. if not callable(objective): raise ValueError( @@ -179,8 +178,7 @@ def get_command_using_train_func( # Install Python packages if that is required. if packages_to_install is not None: exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script + get_script_for_python_packages(packages_to_install, pip_index_url) + exec_script ) # Return container command and args to execute training function. @@ -289,9 +287,7 @@ def get_pvc_spec( metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements( - requests={"storage": storage_config["size"]} - ), + resources=models.V1ResourceRequirements(requests={"storage": storage_config["size"]}), ), ) From c56880602796687c97eccc2b884e46c7b5b5e0dc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:09:50 +0800 Subject: [PATCH 42/53] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 07601405de4..e743dae8816 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -178,7 +178,8 @@ def get_command_using_train_func( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) + exec_script + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script ) # Return container command and args to execute training function. @@ -287,7 +288,9 @@ def get_pvc_spec( metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements(requests={"storage": storage_config["size"]}), + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), ), ) From 6f65253ff951e3683b3d88fbc0e4a4e3969490ae Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:35:13 +0800 Subject: [PATCH 43/53] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 175 +++++++++++++----- 1 file changed, 127 insertions(+), 48 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 8591a01257d..7cc10d7521a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -17,21 +17,26 @@ import logging import multiprocessing import time -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -logger = logging.getLogger(__name__) - -import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.utils import utils -from kubernetes import client, config +from kubernetes import client +from kubernetes import config import grpc logger = logging.getLogger(__name__) +if TYPE_CHECKING: + from kubeflow.storage_initializer.hugging_face import 
HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.storage_initializer.s3 import S3DatasetParams + class KatibClient(object): def __init__( @@ -134,14 +139,18 @@ def create_experiment( "name" ] # if "generate_name" is used, "name" gets a prefix from server except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to create Katib Experiment: {namespace}/{experiment_name}") + raise TimeoutError( + f"Timeout to create Katib Experiment: {namespace}/{experiment_name}" + ) except Exception as e: if hasattr(e, "status") and e.status == 409: raise Exception( f"A Katib Experiment with the name " f"{namespace}/{experiment_name} already exists." ) - raise RuntimeError(f"Failed to create Katib Experiment: {namespace}/{experiment_name}") + raise RuntimeError( + f"Failed to create Katib Experiment: {namespace}/{experiment_name}" + ) logger.debug(f"Experiment {namespace}/{experiment_name} has been created") @@ -183,7 +192,9 @@ def tune( Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, algorithm_name: str = "random", - algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None, + algorithm_settings: Union[ + dict, List[models.V1beta1AlgorithmSetting], None + ] = None, objective_metric_name: str = None, additional_metric_names: List[str] = [], objective_type: str = "maximize", @@ -230,8 +241,8 @@ def tune( dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. For example, name of the HuggingFace dataset or AWS S3 configuration. - This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` - or `kubeflow.storage_initializer.s3.S3DatasetParams` + This argument must be the type of `kubeflow.storage_initializer.hugging_face. 
+ HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams`. trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. You should use the Katib SDK to define @@ -394,7 +405,10 @@ class name in this argument. env = [] env_from = [] if isinstance(env_per_trial, dict): - env = [client.V1EnvVar(name=str(k), value=str(v)) for k, v in env_per_trial.items()] + env = [ + client.V1EnvVar(name=str(k), value=str(v)) + for k, v in env_per_trial.items() + ] elif env_per_trial: for x in env_per_trial: if isinstance(x, client.V1EnvVar): @@ -402,13 +416,17 @@ class name in this argument. elif isinstance(x, client.V1EnvFromSource): env_from.append(x) else: - raise ValueError(f"Incorrect value for env_per_trial: {env_per_trial}") + raise ValueError( + f"Incorrect value for env_per_trial: {env_per_trial}" + ) # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -479,19 +497,23 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) + from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( HuggingFaceModelParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.training.constants.constants import STORAGE_INITIALIZER from kubeflow.training.constants.constants import ( - STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, + ) + from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER_VOLUME_MOUNT, + ) + from kubeflow.training.constants.constants import ( TRAINER_TRANSFORMER_IMAGE, ) except ImportError: @@ -512,11 +534,15 @@ class name in this argument. ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim( + namespace + ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == name: - print(f"PVC '{name}' already exists in namespace " f"{namespace}.") + print( + f"PVC '{name}' already exists in namespace " f"{namespace}." + ) break else: raise RuntimeError(f"failed to create PVC. Error: {e}") @@ -534,7 +560,8 @@ class name in this argument. dp = "hf" else: raise ValueError( - "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams." + "Dataset provider parameters must be an instance of S3DatasetParams " + "or HuggingFaceDatasetParams." ) # Iterate over input parameters. @@ -596,12 +623,14 @@ class name in this argument. 
init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, - base_image="docker.io/helenxiehz428/test", #STORAGE_INITIALIZER_IMAGE, + base_image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -615,7 +644,7 @@ class name in this argument. container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image="docker.io/helenxiehz428/test_llm4", #TRAINER_TRANSFORMER_IMAGE, + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -636,7 +665,9 @@ class name in this argument. storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource(claim_name=name), + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=name + ), ) pod_spec = utils.get_pod_template_spec( @@ -748,13 +779,19 @@ def list_experiments( ) response = thread.get(timeout) result = [ - self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Experiment) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Experiment + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Experiments in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Experiments in namespace: {namespace}" + ) except Exception: - raise RuntimeError(f"Failed to list Katib Experiments in namespace: {namespace}") + raise RuntimeError( + f"Failed to list Katib Experiments in namespace: {namespace}" + ) return result def get_experiment_conditions( @@ -991,14 +1028,20 @@ def wait_for_experiment_condition( # Wait for Failed condition. 
if ( expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Raise exception if Experiment is Failed. - elif self.is_experiment_failed(name, namespace, experiment, apiserver_timeout): + elif self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " f"Experiment conditions: {experiment.status.conditions}" @@ -1007,34 +1050,48 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_created( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Running condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_running( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Restarting condition. 
elif ( expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_restarting( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Succeeded condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_succeeded( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) @@ -1161,7 +1218,9 @@ def delete_experiment( body=delete_options, ) except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to delete Katib Experiment: {namespace}/{name}") + raise TimeoutError( + f"Timeout to delete Katib Experiment: {namespace}/{name}" + ) except Exception: raise RuntimeError(f"Failed to delete Katib Experiment: {namespace}/{name}") @@ -1243,13 +1302,19 @@ def list_suggestions( ) response = thread.get(timeout) result = [ - self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Suggestion) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Suggestion + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Suggestions in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Suggestions in namespace: {namespace}" + ) except Exception: - raise RuntimeError(f"Failed to list Katib Suggestions in namespace: {namespace}") + raise RuntimeError( + f"Failed to list Katib Suggestions in namespace: {namespace}" + ) return result def get_trial( @@ -1341,11 +1406,15 @@ def list_trials( ) response = thread.get(timeout) result = [ - 
self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Trial + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Trials in namespace: {namespace}" + ) except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result @@ -1397,18 +1466,28 @@ def get_success_trial_details( ) response = thread.get(timeout) for item in response.get("items"): - trial = self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) - if trial.status and trial.status.conditions and len(trial.status.conditions) > 0: + trial = self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Trial + ) + if ( + trial.status + and trial.status.conditions + and len(trial.status.conditions) > 0 + ): if utils.has_condition( trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = trial.spec.parameter_assignments + output["parameter_assignments"] = ( + trial.spec.parameter_assignments + ) output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Trials in namespace: {namespace}" + ) except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result From ad17ac9578a03c85c2a630774dd2e39f3488a931 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:42:47 +0800 Subject: [PATCH 44/53] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 1 - 2 files changed, 1 
insertion(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7cc10d7521a..a80fb15b9db 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,6 +19,7 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +import grpc from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants @@ -27,8 +28,6 @@ from kubernetes import client from kubernetes import config -import grpc - logger = logging.getLogger(__name__) if TYPE_CHECKING: diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 8e2620bc168..1e0478f48f8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os - # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. 
DEFAULT_TIMEOUT = 120 From 9a1e2dfab2fa6aa8ad1645d3b270919b097b948f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:45:06 +0800 Subject: [PATCH 45/53] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a80fb15b9db..7cc10d7521a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,7 +19,6 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -import grpc from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants @@ -28,6 +27,8 @@ from kubernetes import client from kubernetes import config +import grpc + logger = logging.getLogger(__name__) if TYPE_CHECKING: From 421aaa682ba958b5b72a88f32dd550b69a2e1c79 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 16:16:24 +0800 Subject: [PATCH 46/53] add pytorchjob for tune api Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 346 ++++++++++++++---- .../kubeflow/katib/constants/constants.py | 1 + 2 files changed, 273 insertions(+), 74 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7cc10d7521a..1e83caa2761 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,7 +19,7 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -from kubeflow.katib import models +from kubeflow.katib import models, types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ 
-202,7 +202,9 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, + resources_per_trial: Union[ + dict, client.V1ResourceRequirements, types.TrainerResources, None + ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", @@ -294,24 +296,50 @@ class name in this argument. https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. - resources_per_trial: A parameter that lets you specify how much resources - each trial container should have. You can either specify a + resources_per_trial: A parameter that lets you specify how much resources each + trial container should have. You can choose between non-distributed training + and distributed training. + 1) Non-distributed training: You can either specify a kubernetes.client.V1ResourceRequirements object (documented here: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) - or a dictionary that includes one or more of the following keys: `cpu`, - `memory`, or `gpu` (other keys will be ignored). Appropriate values - for these keys are documented here: + or a dictionary that includes one or more of the following keys: + `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate + values for these keys are documented here: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. For example: + ``` { "cpu": "1", "gpu": "1", "memory": "2Gi", } - Please note, `gpu` specifies a resource request with a key of - `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of - GPU, pass in a V1ResourceRequirement instance instead, since it's more - flexible. 
This parameter is optional and defaults to None. + ``` + Please note, `gpu` specifies a resource request with a key of `nvidia.com/gpu`, + i.e. an NVIDIA GPU. If you need a different type of GPU, pass in a + V1ResourceRequirement instance instead, since it's more flexible. This + parameter is optional and defaults to None. + 2) Distributed training: You can specify a types.TrainerResources object, which + includes `num_workers`, `num_procs_per_worker`, and `resources_per_worker`. + For example: + ``` + resources_per_trial = types.TrainerResources( + num_workers=4, + num_procs_per_worker=2, + resources_per_worker={ + "gpu": "2", + "cpu": "5", + "memory": "10Gi" + } + ) + ``` + - num_workers: Number of PyTorchJob workers. + - num_procs_per_worker: Number of processes per PyTorchJob worker for + `torchrun` CLI. You can use this parameter if you want to use more than 1 GPU + per PyTorchJob worker. + - resources_per_worker: A parameter that lets you specify how much resources + each PyTorchJob worker container should have. You can either specify + a kubernetes.client.V1ResourceRequirements object or a dictionary, same as + resources specified in non-distributed training. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition to the base image packages. These packages are installed before @@ -470,22 +498,109 @@ class name in this argument. # Otherwise, add value to the function input. 
input_params[p_name] = p_value - container_spec = utils.get_container_spec( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image=base_image, - train_func=objective, - train_func_parameters=input_params, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - resources=resources_per_trial, - env=env, - env_from=env_from, - ) + if isinstance(resources_per_trial, types.TrainerResources): + from kubeflow.training import models as training_models - pod_spec = utils.get_pod_template_spec( - containers=[container_spec], - restart_policy="Never", - ) + if ( + resources_per_trial.num_workers is None + or resources_per_trial.num_workers < 1 + ): + raise ValueError("At least one Worker for PyTorchJob must be set") + + # Create container spec. + container_spec = utils.get_container_spec( + name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, + base_image=base_image, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + resources=resources_per_trial.resources_per_worker, + env=env, + env_from=env_from, + ) + + # Create worker pod spec. + worker_pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + ) + + # Create pytorchjob. 
+ pytorchjob = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={}, + ), + ) + + if resources_per_trial.num_procs_per_worker: + pytorchjob.spec.nproc_per_node = str( + resources_per_trial.num_procs_per_worker + ) + + pytorchjob.spec.pytorch_replica_specs[ + "Master" + ] = training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=worker_pod_spec, + ) + + if resources_per_trial.num_workers > 1: + pytorchjob.spec.pytorch_replica_specs[ + "Worker" + ] = training_models.KubeflowOrgV1ReplicaSpec( + replicas=resources_per_trial.num_workers - 1, + template=worker_pod_spec, + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=pytorchjob, + ) + + else: + # Create container spec. + container_spec = utils.get_container_spec( + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + base_image=base_image, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + resources=resources_per_trial, + env=env, + env_from=env_from, + ) + + # Create pod spec. + pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + restart_policy="Never", + ) + + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=trial_spec, + ) # If users choose to use external models and datasets. 
else: @@ -639,30 +754,6 @@ class name in this argument. volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) - training_args = json.dumps(training_args.to_dict()) - - container_spec = utils.get_container_spec( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image=TRAINER_TRANSFORMER_IMAGE, - args=[ - "--model_uri", - model_provider_parameters.model_uri, - "--transformer_type", - model_provider_parameters.transformer_type.__name__, - "--model_dir", - VOLUME_PATH_MODEL, - "--dataset_dir", - VOLUME_PATH_DATASET, - "--lora_config", - f"'{lora_config}'", - "--training_parameters", - f"'{training_args}'", - ], - volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial, - ) - storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( @@ -670,29 +761,136 @@ class name in this argument. ), ) - pod_spec = utils.get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], - restart_policy="Never", - ) + if isinstance(resources_per_trial, types.TrainerResources): + from kubeflow.training import models as training_models - # Create Trial specification. - trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) + if ( + resources_per_trial.num_workers is None + or resources_per_trial.num_workers < 1 + ): + raise ValueError("At least one Worker for PyTorchJob must be set") + + # Create container spec. 
+ container_spec = utils.get_container_spec( + name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, + base_image=TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{json.dumps(lora_config.__dict__, cls=utils.SetEncoder)}'", + "--training_parameters", + f"'{json.dumps(training_args.to_dict())}'", + ], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + resources=resources_per_trial.resources_per_worker, + ) - # Create Trial template. - trial_template = models.V1beta1TrialTemplate( - primary_container_name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - retain=retain_trials, - trial_parameters=trial_params, - trial_spec=trial_spec, - ) + # Create pod spec. + worker_pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + volumes=[storage_initializer_volume], + ) + + master_pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + ) + + # Create pytorchjob. 
+ pytorchjob = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={}, + ), + ) + + if resources_per_trial.num_procs_per_worker: + pytorchjob.spec.nproc_per_node = str( + resources_per_trial.num_procs_per_worker + ) + + pytorchjob.spec.pytorch_replica_specs[ + "Master" + ] = training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=master_pod_spec, + ) + + if resources_per_trial.num_workers > 1: + pytorchjob.spec.pytorch_replica_specs[ + "Worker" + ] = training_models.KubeflowOrgV1ReplicaSpec( + replicas=resources_per_trial.num_workers - 1, + template=worker_pod_spec, + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=pytorchjob, + ) + + else: + # Create container spec. + container_spec = utils.get_container_spec( + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + base_image=TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{json.dumps(lora_config.__dict__, cls=utils.SetEncoder)}'", + "--training_parameters", + f"'{json.dumps(training_args.to_dict())}'", + ], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + resources=resources_per_trial, + ) + + # Create pod spec. + pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + restart_policy="Never", + ) + + # Create Trial specification. 
+ trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=trial_spec, + ) # Add parameters to the Katib Experiment. experiment.spec.parameters = experiment_params @@ -1479,9 +1677,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = ( - trial.spec.parameter_assignments - ) + output[ + "parameter_assignments" + ] = trial.spec.parameter_assignments output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 1e0478f48f8..9de23a7bd61 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -33,6 +33,7 @@ DEFAULT_PRIMARY_CONTAINER_NAME = "training-container" +PYTORCHJOB_PRIMARY_CONTAINER_NAME = "pytorch" # Label to identify Experiment's resources. 
EXPERIMENT_LABEL = "katib.kubeflow.org/experiment" From bab4d92b341972e9533057e3278df7410ea2a6c8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 16:37:14 +0800 Subject: [PATCH 47/53] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 1e83caa2761..44f84011b2c 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,7 +19,8 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -from kubeflow.katib import models, types +from kubeflow.katib import models +from kubeflow.katib import types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -542,19 +543,19 @@ class name in this argument. resources_per_trial.num_procs_per_worker ) - pytorchjob.spec.pytorch_replica_specs[ - "Master" - ] = training_models.KubeflowOrgV1ReplicaSpec( - replicas=1, - template=worker_pod_spec, + pytorchjob.spec.pytorch_replica_specs["Master"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=worker_pod_spec, + ) ) if resources_per_trial.num_workers > 1: - pytorchjob.spec.pytorch_replica_specs[ - "Worker" - ] = training_models.KubeflowOrgV1ReplicaSpec( - replicas=resources_per_trial.num_workers - 1, - template=worker_pod_spec, + pytorchjob.spec.pytorch_replica_specs["Worker"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=resources_per_trial.num_workers - 1, + template=worker_pod_spec, + ) ) # Create Trial template. @@ -821,19 +822,19 @@ class name in this argument. 
resources_per_trial.num_procs_per_worker ) - pytorchjob.spec.pytorch_replica_specs[ - "Master" - ] = training_models.KubeflowOrgV1ReplicaSpec( - replicas=1, - template=master_pod_spec, + pytorchjob.spec.pytorch_replica_specs["Master"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=master_pod_spec, + ) ) if resources_per_trial.num_workers > 1: - pytorchjob.spec.pytorch_replica_specs[ - "Worker" - ] = training_models.KubeflowOrgV1ReplicaSpec( - replicas=resources_per_trial.num_workers - 1, - template=worker_pod_spec, + pytorchjob.spec.pytorch_replica_specs["Worker"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=resources_per_trial.num_workers - 1, + template=worker_pod_spec, + ) ) # Create Trial template. @@ -1677,9 +1678,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output[ - "parameter_assignments" - ] = trial.spec.parameter_assignments + output["parameter_assignments"] = ( + trial.spec.parameter_assignments + ) output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: From f11051d8c561a15a0127c2ab00a00cf74f046810 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 16:44:35 +0800 Subject: [PATCH 48/53] add 'types' module Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/types/__init__.py | 7 + .../kubeflow/katib/types/trainer_resources.py | 145 ++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py new file mode 100644 index 00000000000..259f78db83a --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +# Import types into type package. 
+from kubeflow.katib.types.trainer_resources import TrainerResources + +# Import Kubernetes models. +from kubernetes.client import * \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py new file mode 100644 index 00000000000..b147b577e2a --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -0,0 +1,145 @@ +import pprint + +from kubeflow.katib.configuration import Configuration +import six + + +class TrainerResources(object): + def __init__( + self, + num_workers=None, + num_procs_per_worker=None, + resources_per_worker=None, + local_vars_configuration=None, + ): + if local_vars_configuration is None: + local_vars_configuration = Configuration() + self.local_vars_configuration = local_vars_configuration + + self._num_workers = None + self._num_procs_per_worker = None + self._resources_per_worker = None + + if num_workers is not None: + self.num_workers = num_workers + if num_procs_per_worker is not None: + self.num_procs_per_worker = num_procs_per_worker + if resources_per_worker is not None: + self.resources_per_worker = resources_per_worker + + @property + def num_workers(self): + """Gets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :return: The number of workers of distributed training. + :rtype: int + """ + return self._num_workers + + @num_workers.setter + def num_workers(self, num_workers): + """Sets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :param num_workers: The number of workers of distributed training. + :type: int + """ + + self._num_workers = num_workers + + @property + def num_procs_per_worker(self): + """Gets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. 
+ + :return: The number of processes per worker of distributed training. + :rtype: int + """ + return self._num_procs_per_worker + + @num_procs_per_worker.setter + def num_procs_per_worker(self, num_procs_per_worker): + """Sets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. + + :param num_procs_per_worker: The number of processes per worker of distributed training. + :type: int + """ + + self._num_procs_per_worker = num_procs_per_worker + + @property + def resources_per_worker(self): + """Gets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :return: The resources per worker of distributed training. + :rtype: dict or V1ResourceRequirements + """ + return self._resources_per_worker + + @resources_per_worker.setter + def resources_per_worker(self, resources_per_worker): + """Sets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :param resources_per_worker: The resources per worker of distributed training. 
+ :type: dict or V1ResourceRequirements + """ + + self._resources_per_worker = resources_per_worker + + def to_dict(self): + """Returns the resources properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.__dict__): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) + else: + result[attr] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TrainerResources): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TrainerResources): + return True + + return self.to_dict() != other.to_dict() From 96768bc7fae20d3ce40e9ef3d7ea85ab64c5dacf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 17:20:45 +0800 Subject: [PATCH 49/53] add unit test for tune api Signed-off-by: helenxie-bit --- test/unit/v1beta1/tune-api/test_tune_api.py | 1285 +++++++++++++++++++ 1 file changed, 1285 insertions(+) create mode 100644 test/unit/v1beta1/tune-api/test_tune_api.py diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py new file mode 100644 index 00000000000..af3795f9d74 --- /dev/null +++ b/test/unit/v1beta1/tune-api/test_tune_api.py @@ -0,0 +1,1285 @@ +import unittest +from unittest import TestCase +from unittest.mock import Mock, patch + +import kubeflow.katib as katib +from 
kubeflow.katib import KatibClient, models, types +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) +from kubeflow.training import models as training_models +from kubernetes import client +from kubernetes.client.exceptions import ApiException +from peft import LoraConfig +import transformers + +class TestTuneAPI(TestCase): + # Create an instance of the KatibClient + def setUp(self): + self.katib_client = KatibClient(namespace="default") + + # Test input + # Test for missing required parameters + def test_tune_missing_name(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name=None, + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + ) + + self.assertIn("Please specify name for the Experiment.", str(context.exception)) + + # Test for invalid hyperparameter optimization configuration + # Case 1: Set two options: 1) external models and datasets; 2) custom objective at the same time + def test_tune_invalid_with_model_provider_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_dataset_provider_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_trainer_parameters_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def 
test_tune_invalid_with_model_provider_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_dataset_provider_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_trainer_parameters_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + # Case 2: Missing parameters when choosing one option + def test_tune_invalid_with_only_model_provider(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_dataset_provider(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_trainer_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + ) + + self.assertIn("One of the 
required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + parameters={"lr": Mock()}, + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + # Case 3: No parameters provided + def test_tune_no_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune(name="experiment") + + self.assertIn("Invalid configuration", str(context.exception)) + + # Test for invalid parameters + # Case 1: Invalid env_per_trial + def test_tune_invalid_env_per_trial(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + env_per_trial=[123], # Invalid type + ) + + self.assertIn("Incorrect value for env_per_trial", str(context.exception)) + + # Case 2: Invalid resources_per_trial.num_workers (for distributed training) + def test_tune_invalid_resources_per_trial_value(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + resources_per_trial=types.TrainerResources( + num_workers=0, # Invalid value, should be at least 1 + num_procs_per_worker=1, + resources_per_worker={"cpu": "1", "memory": "1Gi"}, + ), + ) + + self.assertIn( + "At least one Worker for PyTorchJob must be set", str(context.exception) + ) + + # Case 3: Invalid model_provider_parameters + def test_tune_invalid_model_provider_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceModelParams + 
dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + ) + + self.assertIn( + "Model provider parameters must be an instance of HuggingFaceModelParams", + str(context.exception), + ) + + # Case 4: Invalid dataset_provider_parameters + def test_tune_invalid_dataset_provider_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + dataset_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceDatasetParameters or S3DatasetParams + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + ) + + self.assertIn( + "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams", + str(context.exception), + ) + + # Case 5: Invalid trainer_parameters.training_parameters + def test_tune_invalid_trainer_parameters_training_parameters(self): + with self.assertRaises(TypeError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + not_a_valid_parameter="no", + ), + lora_config=LoraConfig(), + ), + ) + + self.assertIn( + "TrainingArguments.__init__() got an unexpected keyword argument", + str(context.exception), + ) + + # Case 6: Invalid trainer_parameters.lora_config + def test_tune_invalid_trainer_parameters_lora_config(self): + with self.assertRaises(TypeError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + ), + lora_config=LoraConfig( + not_a_valid_parameter="no", + ), + ), + ) + + self.assertIn( + "LoraConfig.__init__() got an unexpected keyword argument", + str(context.exception), + ) + + # Test functionality + # Test PVC creation + # Case 1: PVC successfully 
created + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation(self, mock_create_experiment, mock_list_pvc, mock_create_pvc): + mock_create_pvc.return_value = Mock() + mock_list_pvc.return_value = Mock(items=[]) + mock_create_experiment.return_value = Mock() + + exp_name = "experiment" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use the first 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + expected_pvc_spec = models.V1PersistentVolumeClaim( + api_version="v1", + kind="PersistentVolumeClaim", + metadata={"name": exp_name, "namespace": "default"}, + spec=models.V1PersistentVolumeClaimSpec( + access_modes=storage_config["access_modes"], + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), + ), + ) + + mock_create_pvc.assert_called_once_with( + namespace="default", body=expected_pvc_spec + ) + + # Case 2: PVC already exists + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation_with_existing_pvc( + self, mock_create_experiment, mock_list_pvc, mock_create_pvc + ): + # Simulate an ApiException being raised when trying to create a PVC + mock_create_pvc.side_effect = ApiException(status=409, reason="Already exists") + + # Simulate existing PVC in the list + mock_existing_pvc = Mock() + mock_existing_pvc.metadata.name = "test-pvc" + mock_list_pvc.return_value = Mock(items=[mock_existing_pvc]) + + mock_create_experiment.return_value = Mock() + + exp_name = "test-pvc" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use the first 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. 
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + # Assert that create_namespaced_persistent_volume_claim was called once + mock_create_pvc.assert_called_once() + + # Assert that list_namespaced_persistent_volume_claim was called to check existing PVCs + mock_list_pvc.assert_called_once_with("default") + + # Ensure no exception is raised since the PVC already exists + self.assertTrue(mock_list_pvc.return_value.items[0].metadata.name == exp_name) + + # Case 3: PVC creation fails + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation_fails( + self, mock_create_experiment, mock_list_pvc, mock_create_pvc + ): + # Simulate an ApiException being raised when trying to create a PVC + mock_create_pvc.side_effect = ApiException( + status=500, reason="Internal Server Error" + ) + + # Simulate no existing PVC in the list + mock_list_pvc.return_value = Mock(items=[]) + + mock_create_experiment.return_value = Mock() + + exp_name = "test-pvc" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + with self.assertRaises(RuntimeError) as context: + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. 
+ model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use the first 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + # Assert that the appropriate error message is raised + self.assertIn("failed to create PVC", str(context.exception)) + + # Assert that create_namespaced_persistent_volume_claim was called once + mock_create_pvc.assert_called_once() + + # Assert that list_namespaced_persistent_volume_claim was called once + mock_list_pvc.assert_called_once_with("default") + + # Test container, pod, job/pytorchjob, trial template, and experiment creation + # Case 1: Custom objective - distributed + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_custom_objective_distributed( + self, mock_create_experiment + ): + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial=types.TrainerResources( + num_workers=3, + num_procs_per_worker=1, + resources_per_worker={"cpu": "1", 
"memory": "1Gi"}, + ), + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_container = [ + models.V1Container( + name="pytorch", + image="docker.io/tensorflow/tensorflow:2.13.0", + command=["bash", "-c"], + args=[ + "\n" + "program_path=$(mktemp -d)\n" + "read -r -d '' SCRIPT << EOM\n" + "\n" + "objective=lambda x: x,\n" + "\n" + "({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n" + "\n" + "EOM\n" + 'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n' + 'python3 -u "$program_path/ephemeral_script.py"' + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + ) + ] + + expected_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + containers=expected_container, + ), + ) + + expected_job = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={ + "Master": training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=expected_pod, + ), + "Worker": training_models.KubeflowOrgV1ReplicaSpec( + replicas=2, + template=expected_pod, + ), + }, + nproc_per_node="1", + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="pytorch", + trial_parameters=[ + models.V1beta1TrialParameterSpec(name="a", reference="a"), + models.V1beta1TrialParameterSpec(name="b", reference="b"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="a", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="10", max="100"), + ), + models.V1beta1ParameterSpec( + name="b", + parameter_type="double", + 
feasible_space=models.V1beta1FeasibleSpace(min="0.1", max="0.2"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].replicas, + 1, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template, + expected_pod, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].replicas, + 2, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template, + expected_pod, + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, expected_trial_template) + + # Case 2: Custom objective - non-distributed + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_custom_objective_non_distributed( + self, mock_create_experiment + ): + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + 
objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial={"cpu": "1", "memory": "1Gi"}, + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_container = [ + models.V1Container( + name="training-container", + image="docker.io/tensorflow/tensorflow:2.13.0", + command=["bash", "-c"], + args=[ + "\n" + "program_path=$(mktemp -d)\n" + "read -r -d '' SCRIPT << EOM\n" + "\n" + "objective=lambda x: x,\n" + "\n" + "({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n" + "\n" + "EOM\n" + 'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n' + 'python3 -u "$program_path/ephemeral_script.py"' + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + ) + ] + + expected_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + containers=expected_container, + restart_policy="Never", + ), + ) + + expected_job = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=expected_pod, + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="training-container", + trial_parameters=[ + models.V1beta1TrialParameterSpec(name="a", reference="a"), + models.V1beta1TrialParameterSpec(name="b", reference="b"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="a", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="10", max="100"), + ), + models.V1beta1ParameterSpec( + name="b", + parameter_type="double", + feasible_space=models.V1beta1FeasibleSpace(min="0.1", max="0.2"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + 
self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template, expected_pod + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, expected_trial_template) + + # Case 3: External models and datasets - distributed + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_external_distributed( + self, mock_create_experiment + ): + exp_name = "experiment" + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use the first 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial=types.TrainerResources( + num_workers=3, + num_procs_per_worker=1, + resources_per_worker={"cpu": "1", "memory": "1Gi"}, + ), + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_init_container = [ + models.V1Container( + name="storage-initializer", + image="docker.io/kubeflow/storage-initializer", + args=[ + "--model_provider", + "hf", + "--model_provider_parameters", + '{"model_uri": "hf://google-bert/bert-base-cased", "transformer_type": "AutoModelForSequenceClassification", ' + '"access_token": null}', + "--dataset_provider", + "hf", + "--dataset_provider_parameters", + '{"repo_id": "yelp_review_full", "access_token": null, "split": "train[:8]"}', + ], + volume_mounts=[ + training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_container = [ + models.V1Container( + name="pytorch", + image="docker.io/kubeflow/trainer-huggingface", + args=[ + "--model_uri", + "hf://google-bert/bert-base-cased", + "--transformer_type", + "AutoModelForSequenceClassification", + "--model_dir", + "/workspace/model", + "--dataset_dir", + "/workspace/dataset", + "--lora_config", + '\'{"peft_type": "LORA", "base_model_name_or_path": null, "task_type": null, ' + '"inference_mode": false, "r": "${trialParameters.r}", "target_modules": null, ' + '"lora_alpha": 8, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", ' + '"modules_to_save": null, "init_lora_weights": true}\'', + "--training_parameters", + '\'{"output_dir": "test_tune_api", "overwrite_output_dir": false, "do_train": ' + 'false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", ' + 
'"prediction_loss_only": false, "per_device_train_batch_size": 8, ' + '"per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, ' + '"per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 1, ' + '"eval_accumulation_steps": null, "eval_delay": 0, "learning_rate": ' + '"${trialParameters.learning_rate}", "weight_decay": 0.0, "adam_beta1": 0.9, ' + '"adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, ' + '"num_train_epochs": 1, "max_steps": -1, "lr_scheduler_type": "linear", ' + '"lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, ' + '"log_level": "passive", "log_level_replica": "warning", "log_on_each_node": ' + 'true, "logging_dir": "test_tune_api/logs", "logging_strategy": "steps", ' + '"logging_first_step": false, "logging_steps": 500, "logging_nan_inf_filter": ' + 'true, "save_strategy": "no", "save_steps": 500, "save_total_limit": null, ' + '"save_safetensors": true, "save_on_each_node": false, "save_only_model": ' + 'false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": ' + '42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": ' + 'false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": ' + '"auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, ' + '"local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, ' + '"tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, ' + '"eval_steps": null, "dataloader_num_workers": 0, "dataloader_prefetch_factor": ' + 'null, "past_index": -1, "run_name": "test_tune_api", "disable_tqdm": false, ' + '"remove_unused_columns": true, "label_names": null, "load_best_model_at_end": ' + 'false, "metric_for_best_model": null, "greater_is_better": null, ' + '"ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, ' + '"fsdp_config": {"min_num_params": 0, "xla": false, "xla_fsdp_v2": false, ' + '"xla_fsdp_grad_ckpt": false}, "fsdp_transformer_layer_cls_to_wrap": null, ' + 
'"accelerator_config": {"split_batches": false, "dispatch_batches": null, ' + '"even_batches": true, "use_seedable_sampler": true}, "deepspeed": null, ' + '"label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, ' + '"adafactor": false, "group_by_length": false, "length_column_name": "length", ' + '"report_to": ["tensorboard"], "ddp_find_unused_parameters": null, ' + '"ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, ' + '"dataloader_pin_memory": true, "dataloader_persistent_workers": false, ' + '"skip_memory_metrics": true, "use_legacy_prediction_loop": false, ' + '"push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, ' + '"hub_strategy": "every_save", "hub_token": "", "hub_private_repo": ' + 'false, "hub_always_push": false, "gradient_checkpointing": false, ' + '"gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, ' + '"fp16_backend": "auto", "push_to_hub_model_id": null, ' + '"push_to_hub_organization": null, "push_to_hub_token": "", ' + '"mp_parameters": "", "auto_find_batch_size": false, "full_determinism": ' + 'false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, ' + '"torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": ' + 'null, "dispatch_batches": null, "split_batches": null, "include_tokens_per_' + 'second": false, "include_num_input_tokens_seen": false, ' + '"neftune_noise_alpha": null}\'', + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + volume_mounts=[ + training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_master_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + init_containers=expected_init_container, + containers=expected_container, + volumes=[ + models.V1Volume( + name="storage-initializer", + 
persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=exp_name + ), + ) + ], + ), + ) + + expected_worker_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + containers=expected_container, + volumes=[ + models.V1Volume( + name="storage-initializer", + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=exp_name + ), + ) + ], + ), + ) + + expected_job = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={ + "Master": training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=expected_master_pod, + ), + "Worker": training_models.KubeflowOrgV1ReplicaSpec( + replicas=2, + template=expected_worker_pod, + ), + }, + nproc_per_node="1", + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="pytorch", + trial_parameters=[ + models.V1beta1TrialParameterSpec( + name="learning_rate", reference="learning_rate" + ), + models.V1beta1TrialParameterSpec(name="r", reference="r"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="learning_rate", + parameter_type="double", + feasible_space=models.V1beta1FeasibleSpace(min="1e-05", max="5e-05"), + ), + models.V1beta1ParameterSpec( + name="r", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="8", max="32"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + 
self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template.spec.init_containers, + expected_init_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].replicas, + 1, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template, + expected_master_pod, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].replicas, + 2, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template, + expected_worker_pod, + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, expected_trial_template) + + # Case 4: External models and datasets - non-distributed + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_external_non_distributed( + self, mock_create_experiment + ): + exp_name = "experiment" + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. 
+ dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial={"cpu": "1", "memory": "1Gi"}, + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_init_container = [ + models.V1Container( + name="storage-initializer", + image="docker.io/kubeflow/storage-initializer", + args=[ + "--model_provider", + "hf", + "--model_provider_parameters", + '{"model_uri": "hf://google-bert/bert-base-cased", "transformer_type": "AutoModelForSequenceClassification", ' + '"access_token": null}', + "--dataset_provider", + "hf", + "--dataset_provider_parameters", + '{"repo_id": "yelp_review_full", "access_token": null, "split": "train[:8]"}', + ], + volume_mounts=[ + training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_container = [ + models.V1Container( + name="training-container", + image="docker.io/kubeflow/trainer-huggingface", + args=[ + "--model_uri", + "hf://google-bert/bert-base-cased", + "--transformer_type", + "AutoModelForSequenceClassification", + "--model_dir", + "/workspace/model", + "--dataset_dir", + "/workspace/dataset", + "--lora_config", + '\'{"peft_type": "LORA", "base_model_name_or_path": null, "task_type": null, ' + 
'"inference_mode": false, "r": "${trialParameters.r}", "target_modules": null, ' + '"lora_alpha": 8, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", ' + '"modules_to_save": null, "init_lora_weights": true}\'', + "--training_parameters", + '\'{"output_dir": "test_tune_api", "overwrite_output_dir": false, "do_train": ' + 'false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", ' + '"prediction_loss_only": false, "per_device_train_batch_size": 8, ' + '"per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, ' + '"per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 1, ' + '"eval_accumulation_steps": null, "eval_delay": 0, "learning_rate": ' + '"${trialParameters.learning_rate}", "weight_decay": 0.0, "adam_beta1": 0.9, ' + '"adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, ' + '"num_train_epochs": 1, "max_steps": -1, "lr_scheduler_type": "linear", ' + '"lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, ' + '"log_level": "passive", "log_level_replica": "warning", "log_on_each_node": ' + 'true, "logging_dir": "test_tune_api/logs", "logging_strategy": "steps", ' + '"logging_first_step": false, "logging_steps": 500, "logging_nan_inf_filter": ' + 'true, "save_strategy": "no", "save_steps": 500, "save_total_limit": null, ' + '"save_safetensors": true, "save_on_each_node": false, "save_only_model": ' + 'false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": ' + '42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": ' + 'false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": ' + '"auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, ' + '"local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, ' + '"tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, ' + '"eval_steps": null, "dataloader_num_workers": 0, "dataloader_prefetch_factor": ' + 'null, "past_index": -1, "run_name": "test_tune_api", 
"disable_tqdm": false, ' + '"remove_unused_columns": true, "label_names": null, "load_best_model_at_end": ' + 'false, "metric_for_best_model": null, "greater_is_better": null, ' + '"ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, ' + '"fsdp_config": {"min_num_params": 0, "xla": false, "xla_fsdp_v2": false, ' + '"xla_fsdp_grad_ckpt": false}, "fsdp_transformer_layer_cls_to_wrap": null, ' + '"accelerator_config": {"split_batches": false, "dispatch_batches": null, ' + '"even_batches": true, "use_seedable_sampler": true}, "deepspeed": null, ' + '"label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, ' + '"adafactor": false, "group_by_length": false, "length_column_name": "length", ' + '"report_to": ["tensorboard"], "ddp_find_unused_parameters": null, ' + '"ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, ' + '"dataloader_pin_memory": true, "dataloader_persistent_workers": false, ' + '"skip_memory_metrics": true, "use_legacy_prediction_loop": false, ' + '"push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, ' + '"hub_strategy": "every_save", "hub_token": "", "hub_private_repo": ' + 'false, "hub_always_push": false, "gradient_checkpointing": false, ' + '"gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, ' + '"fp16_backend": "auto", "push_to_hub_model_id": null, ' + '"push_to_hub_organization": null, "push_to_hub_token": "", ' + '"mp_parameters": "", "auto_find_batch_size": false, "full_determinism": ' + 'false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, ' + '"torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": ' + 'null, "dispatch_batches": null, "split_batches": null, "include_tokens_per_' + 'second": false, "include_num_input_tokens_seen": false, ' + '"neftune_noise_alpha": null}\'', + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + volume_mounts=[ + 
training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + init_containers=expected_init_container, + containers=expected_container, + volumes=[ + models.V1Volume( + name="storage-initializer", + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=exp_name + ), + ) + ], + restart_policy="Never", + ), + ) + + expected_job = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=expected_pod, + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="training-container", + trial_parameters=[ + models.V1beta1TrialParameterSpec( + name="learning_rate", reference="learning_rate" + ), + models.V1beta1TrialParameterSpec(name="r", reference="r"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="learning_rate", + parameter_type="double", + feasible_space=models.V1beta1FeasibleSpace(min="1e-05", max="5e-05"), + ), + models.V1beta1ParameterSpec( + name="r", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="8", max="32"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template.spec.init_containers, + expected_init_container, + ) + 
self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template, expected_pod + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, expected_trial_template) + + +if __name__ == "__main__": + unittest.main() From 3edfb49b438fb0cbac8bb05a55ad1f1dd45e15d4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 17:37:49 +0800 Subject: [PATCH 50/53] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 2 +- .../v1beta1/kubeflow/katib/types/trainer_resources.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index 259f78db83a..73f1110cd5a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -4,4 +4,4 @@ from kubeflow.katib.types.trainer_resources import TrainerResources # Import Kubernetes models. 
-from kubernetes.client import * \ No newline at end of file +from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index b147b577e2a..9dacc5eb247 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -111,9 +111,11 @@ def to_dict(self): elif isinstance(value, dict): result[attr] = dict( map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") - else item, + lambda item: ( + (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item + ), value.items(), ) ) From ccdc61290a2cf5d31a10b5fb1def5173905ec32d Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 18:05:14 +0800 Subject: [PATCH 51/53] fix format Signed-off-by: helenxie-bit --- test/unit/v1beta1/tune-api/test_tune_api.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py index af3795f9d74..085266f72d8 100644 --- a/test/unit/v1beta1/tune-api/test_tune_api.py +++ b/test/unit/v1beta1/tune-api/test_tune_api.py @@ -1,20 +1,22 @@ import unittest from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import Mock +from unittest.mock import patch +from kubeflow.katib import KatibClient +from kubeflow.katib import models +from kubeflow.katib import types import kubeflow.katib as katib -from kubeflow.katib import KatibClient, models, types -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - HuggingFaceModelParams, - HuggingFaceTrainerParams, -) +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams 
from kubeflow.training import models as training_models from kubernetes import client from kubernetes.client.exceptions import ApiException from peft import LoraConfig import transformers + class TestTuneAPI(TestCase): # Create an instance of the KatibClient def setUp(self): From fa6c9d7bf5be42e9e07c68eb22f47399bdea00d6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 18:58:11 +0800 Subject: [PATCH 52/53] add e2e test for tune api with llm hyperparameters optimization Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 5 + .../scripts/gh-actions/run-e2e-tune-api.py | 240 +++++++++++++++++- 2 files changed, 239 insertions(+), 6 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e1f37a3701b..2a6cd824bac 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -22,10 +22,15 @@ jobs: with: kubernetes-version: ${{ matrix.kubernetes-version }} + - name: Install Training Operator SDK + shell: bash + run: pip install kubeflow[huggingface] + - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test with: tune-api: true + training-operator: true strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 1ca3596af95..46bce268975 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -2,10 +2,19 @@ import logging from kubeflow.katib import KatibClient -from kubeflow.katib import search +from kubeflow.katib import search, types from kubernetes import client from verify import verify_experiment_results +import transformers +from peft import LoraConfig + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, +) + # Experiment timeout is 40 min. 
EXPERIMENT_TIMEOUT = 60 * 40 @@ -13,7 +22,8 @@ logging.basicConfig(level=logging.INFO) -def run_e2e_experiment_create_by_tune( +# Test for Experiment created with custom objective and non-distributed training. +def run_e2e_experiment_create_by_tune_with_custom_objective_non_distributed( katib_client: KatibClient, exp_name: str, exp_namespace: str, @@ -58,6 +68,182 @@ def objective(parameters): logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) +# Test for Experiment created with custom objective and distributed training. +def run_e2e_experiment_create_by_tune_with_custom_objective_distributed( + katib_client: KatibClient, + exp_name: str, + exp_namespace: str, +): + # Create Katib Experiment and wait until it is finished. + logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) + + # Use the test case from get-started tutorial. + # https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk + # [1] Create an objective function. + def objective(parameters): + import time + time.sleep(5) + result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2 + print(f"result={result}") + + # [2] Create hyperparameter search space. + parameters = { + "a": search.int(min=10, max=20), + "b": search.double(min=0.1, max=0.2) + } + + # [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial. + # And Wait until Experiment reaches Succeeded condition. + katib_client.tune( + name=exp_name, + namespace=exp_namespace, + objective=objective, + parameters=parameters, + objective_metric_name="result", + max_trial_count=4, + resources_per_trial=types.TrainerResources( + num_workers=1, + num_procs_per_worker=1, + resources_per_worker={"cpu": "2"}, + ), + ) + experiment = katib_client.wait_for_experiment_condition( + exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT + ) + + # Verify the Experiment results. 
+ verify_experiment_results(katib_client, experiment, exp_name, exp_namespace) + + # Print the Experiment and Suggestion. + logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) + logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) + +# Test for Experiment created with external models and datasets and non-distributed training. +def run_e2e_experiment_create_by_tune_with_external_model_non_distributed( + katib_client: KatibClient, + exp_name: str, + exp_namespace: str, +): + # Create Katib Experiment and wait until it is finished. + logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) + + # Use the test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + # Create Katib Experiment. + # And Wait until Experiment reaches Succeeded condition. + katib_client.tune( + name=exp_name, + namespace=exp_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate = search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. 
+ lora_config=LoraConfig( + r = search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 1, + parallel_trial_count = 1, + resources_per_trial={ + "cpu": "4", + "memory": "10G", + }, + ) + experiment = katib_client.wait_for_experiment_condition( + exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT + ) + + # Verify the Experiment results. + verify_experiment_results(katib_client, experiment, exp_name, exp_namespace) + + # Print the Experiment and Suggestion. + logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) + logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) + +# Test for Experiment created with external models and datasets and distributed training. +def run_e2e_experiment_create_by_tune_with_external_model_distributed( + katib_client: KatibClient, + exp_name: str, + exp_namespace: str, +): + # Create Katib Experiment and wait until it is finished. + logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) + + # Use the test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + # Create Katib Experiment. + # And Wait until Experiment reaches Succeeded condition. + katib_client.tune( + name=exp_name, + namespace=exp_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters.
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate = search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r = search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 1, + parallel_trial_count = 1, + resources_per_trial=types.TrainerResources( + num_workers=1, + num_procs_per_worker=1, + resources_per_worker={"cpu": "4", "memory": "10G",}, + ), + ) + experiment = katib_client.wait_for_experiment_condition( + exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT + ) + + # Verify the Experiment results. + verify_experiment_results(katib_client, experiment, exp_name, exp_namespace) + + # Print the Experiment and Suggestion. + logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) + logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -83,15 +269,57 @@ def objective(parameters): exp_name = "tune-example" exp_namespace = args.namespace try: - run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace) + run_e2e_experiment_create_by_tune_with_custom_objective_non_distributed(katib_client, f"{exp_name}-1", exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-1") + except Exception as e: + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1") + raise e + finally: + # Delete the Experiment.
+ logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) + + try: + run_e2e_experiment_create_by_tune_with_custom_objective_distributed(katib_client, f"{exp_name}-2", exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-2") + except Exception as e: + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") + raise e + finally: + # Delete the Experiment. + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) + + try: + run_e2e_experiment_create_by_tune_with_external_model_non_distributed(katib_client, f"{exp_name}-3", exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-3") + except Exception as e: + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-3") + raise e + finally: + # Delete the Experiment.
+ logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-3", exp_namespace) + + try: + run_e2e_experiment_create_by_tune_with_external_model_distributed(katib_client, f"{exp_name}-4", exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-4") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-4") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(exp_name, exp_namespace) + katib_client.delete_experiment(f"{exp_name}-4", exp_namespace) From 752c7129149e57d0384ad1974f18acb7741e4819 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 19:12:17 +0800 Subject: [PATCH 53/53] fix format Signed-off-by: helenxie-bit --- .../scripts/gh-actions/run-e2e-tune-api.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 46bce268975..d6b36d650c1 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -2,18 +2,15 @@ import logging from kubeflow.katib import KatibClient -from kubeflow.katib import search, types +from kubeflow.katib import search
+from kubeflow.katib import types +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams from kubernetes import client -from verify import verify_experiment_results - -import transformers from peft import LoraConfig - -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - HuggingFaceTrainerParams, -) +import transformers +from verify import verify_experiment_results # Experiment timeout is 40 min. EXPERIMENT_TIMEOUT = 60 * 40