diff --git a/.github/workflows/integrate.yaml b/.github/workflows/integrate.yaml index 37deed4b..1e39de6e 100644 --- a/.github/workflows/integrate.yaml +++ b/.github/workflows/integrate.yaml @@ -42,6 +42,40 @@ jobs: - name: Check flake8 run: flake8 ./charms/*/src + charm-integration: + name: Integration tests (microk8s) + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + charm: + - controller + steps: + - uses: actions/checkout@v3 + - name: Setup operator environment + uses: charmed-kubernetes/actions-operator@main + with: + provider: microk8s + channel: 1.24/stable + juju-channel: 2.9/stable + charmcraft-channel: latest/candidate + + # TODO: Remove once the actions-operator does this automatically + - name: Configure kubectl + run: | + sg microk8s -c "microk8s config > ~/.kube/config" + + - run: | + sg microk8s -c "tox -e ${{ matrix.charm }}-integration" + + # Collect debug logs if failed + - name: Dump Juju/k8s logs on failure + uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main + if: always() + with: + app: ${{ matrix.charm }} + model: testing + test-bundle: name: Test the bundle runs-on: ubuntu-20.04 diff --git a/README.md b/README.md index 794481a0..891dba75 100644 --- a/README.md +++ b/README.md @@ -65,3 +65,39 @@ available charm run: If you aim to use Katib within an existing Kubeflow deployment in order to use it within the Kubeflow dashboard, you will have to integrate `katib-ui` to `istio-pilot` with the following command: juju relate istio-pilot katib-ui + +## Setting Custom Images for Katib Controller + +Katib controller comes with a set of preconfigured images that are used in Katib workloads. This is the list of default images used in charm. 
+ +```json +{ + "default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.15.0", + "early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0", + "enas_cpu_template": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0", + "metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0", + "metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0", + "metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0", + "pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0", + "pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0", + "suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0", + "suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0", + "suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0", + "suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0", + "suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0", + "suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0", + "suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0", + "suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0", + "suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0", + "suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0", + "suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0" +} +``` + +These images can be overridden in the charm configuration under custom_images in the charms/katib-controller/config.yaml file. Whenever you leave the image field empty in the config, the default image will be used. You can specify your own images with the config by filling one or multiple entries. The config accepts either YAML or JSON entries. For example: 
+ +``` +juju config katib-controller custom_images='{"default_trial_template": "custom:1.0", "early_stopping__medianstop": "custom:2.1"}' +``` + +These images are used in the *.j2 template files under charms/katib-controller/src/templates/. diff --git a/charms/katib-controller/config.yaml b/charms/katib-controller/config.yaml index f771f5e9..a4efdae0 100644 --- a/charms/katib-controller/config.yaml +++ b/charms/katib-controller/config.yaml @@ -7,3 +7,28 @@ options: type: int default: 8080 description: Metrics port + custom_images: + type: string + default: | + default_trial_template: '' + early_stopping__medianstop: '' + enas_cpu_template: '' + metrics_collector_sidecar__stdout: '' + metrics_collector_sidecar__file: '' + metrics_collector_sidecar__tensorflow_event: '' + pytorch_job_template__master: '' + pytorch_job_template__worker: '' + suggestion__random: '' + suggestion__tpe: '' + suggestion__grid: '' + suggestion__hyperband: '' + suggestion__bayesianoptimization: '' + suggestion__cmaes: '' + suggestion__sobol: '' + suggestion__multivariate_tpe: '' + suggestion__enas: '' + suggestion__darts: '' + suggestion__pbt: '' + description: > + YAML or JSON formatted input defining images to use in Katib + For usage details, see https://github.com/canonical/katib-operators. 
diff --git a/charms/katib-controller/requirements-integration.in b/charms/katib-controller/requirements-integration.in new file mode 100644 index 00000000..fd6f9561 --- /dev/null +++ b/charms/katib-controller/requirements-integration.in @@ -0,0 +1,3 @@ +juju==2.9.44 +lightkube +pytest-operator diff --git a/charms/katib-controller/requirements-integration.txt b/charms/katib-controller/requirements-integration.txt new file mode 100644 index 00000000..76c7b3f2 --- /dev/null +++ b/charms/katib-controller/requirements-integration.txt @@ -0,0 +1,204 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements-integration.in +# +anyio==3.7.1 + # via httpcore +asttokens==2.2.1 + # via stack-data +backcall==0.2.0 + # via ipython +bcrypt==4.0.1 + # via paramiko +cachetools==5.3.1 + # via google-auth +certifi==2023.7.22 + # via + # httpcore + # httpx + # kubernetes + # requests +cffi==1.15.1 + # via + # cryptography + # pynacl +charset-normalizer==3.2.0 + # via requests +cryptography==41.0.2 + # via paramiko +decorator==5.1.1 + # via + # ipdb + # ipython +exceptiongroup==1.1.2 + # via + # anyio + # pytest +executing==1.2.0 + # via stack-data +google-auth==2.22.0 + # via kubernetes +h11==0.14.0 + # via httpcore +httpcore==0.17.3 + # via httpx +httpx==0.24.1 + # via lightkube +idna==3.4 + # via + # anyio + # httpx + # requests +iniconfig==2.0.0 + # via pytest +ipdb==0.13.13 + # via pytest-operator +ipython==8.12.2 + # via ipdb +jedi==0.18.2 + # via ipython +jinja2==3.1.2 + # via pytest-operator +juju==2.9.44 + # via + # -r requirements-integration.in + # pytest-operator +jujubundlelib==0.5.7 + # via theblues +kubernetes==27.2.0 + # via juju +lightkube==0.14.0 + # via -r requirements-integration.in +lightkube-models==1.27.1.4 + # via lightkube +macaroonbakery==1.3.1 + # via + # juju + # theblues +markupsafe==2.1.3 + # via jinja2 +matplotlib-inline==0.1.6 + # via ipython +mypy-extensions==1.0.0 + # via 
typing-inspect +oauthlib==3.2.2 + # via + # kubernetes + # requests-oauthlib +packaging==23.1 + # via pytest +paramiko==2.12.0 + # via juju +parso==0.8.3 + # via jedi +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +pluggy==1.2.0 + # via pytest +prompt-toolkit==3.0.39 + # via ipython +protobuf==3.20.3 + # via macaroonbakery +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.2 + # via stack-data +pyasn1==0.5.0 + # via + # juju + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 + # via google-auth +pycparser==2.21 + # via cffi +pygments==2.15.1 + # via ipython +pymacaroons==0.13.0 + # via macaroonbakery +pynacl==1.5.0 + # via + # macaroonbakery + # paramiko + # pymacaroons +pyrfc3339==1.1 + # via + # juju + # macaroonbakery +pytest==7.4.0 + # via + # pytest-asyncio + # pytest-operator +pytest-asyncio==0.21.1 + # via pytest-operator +pytest-operator==0.28.0 + # via -r requirements-integration.in +python-dateutil==2.8.2 + # via kubernetes +pytz==2023.3 + # via pyrfc3339 +pyyaml==6.0.1 + # via + # juju + # jujubundlelib + # kubernetes + # lightkube + # pytest-operator +requests==2.31.0 + # via + # kubernetes + # macaroonbakery + # requests-oauthlib + # theblues +requests-oauthlib==1.3.1 + # via kubernetes +rsa==4.9 + # via google-auth +six==1.16.0 + # via + # google-auth + # kubernetes + # macaroonbakery + # paramiko + # pymacaroons + # python-dateutil +sniffio==1.3.0 + # via + # anyio + # httpcore + # httpx +stack-data==0.6.2 + # via ipython +theblues==0.5.2 + # via juju +tomli==2.0.1 + # via + # ipdb + # pytest +toposort==1.10 + # via juju +traitlets==5.9.0 + # via + # ipython + # matplotlib-inline +typing-extensions==4.7.1 + # via + # ipython + # typing-inspect +typing-inspect==0.9.0 + # via juju +urllib3==1.26.16 + # via + # google-auth + # kubernetes + # requests +wcwidth==0.2.6 + # via prompt-toolkit +websocket-client==1.5.3 + # via kubernetes +websockets==7.0 + # via juju diff --git a/charms/katib-controller/requirements.in 
b/charms/katib-controller/requirements.in index 7a994037..4801d2f6 100644 --- a/charms/katib-controller/requirements.in +++ b/charms/katib-controller/requirements.in @@ -1,2 +1,5 @@ +jinja2 ops oci-image +# Pinning until this resolves https://github.com/yaml/pyyaml/issues/724 +pyyaml==6.0.1 diff --git a/charms/katib-controller/requirements.txt b/charms/katib-controller/requirements.txt index f6ca8a27..6422cc9e 100644 --- a/charms/katib-controller/requirements.txt +++ b/charms/katib-controller/requirements.txt @@ -4,11 +4,17 @@ # # pip-compile requirements.in # +jinja2==3.1.2 + # via -r requirements.in +markupsafe==2.1.3 + # via jinja2 oci-image==1.0.0 # via -r requirements.in ops==2.3.0 # via -r requirements.in -pyyaml==6.0 - # via ops +pyyaml==6.0.1 + # via + # -r requirements.in + # ops websocket-client==1.5.3 # via ops diff --git a/charms/katib-controller/src/charm.py b/charms/katib-controller/src/charm.py index 61aa3b9e..eee51d65 100755 --- a/charms/katib-controller/src/charm.py +++ b/charms/katib-controller/src/charm.py @@ -6,20 +6,92 @@ from base64 import b64encode from pathlib import Path from subprocess import check_call +from typing import Dict import yaml from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider -from jinja2 import Environment, FileSystemLoader +from jinja2 import Environment, FileSystemLoader, Template from oci_image import OCIImageResource, OCIImageResourceError from ops.charm import CharmBase from ops.framework import StoredState from ops.main import main -from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus +from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus + +DEFAULT_IMAGES = { + "default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.15.0", + "early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0", + "enas_cpu_template": 
"docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0", + "metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0", + "metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0", + "metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0", # noqa: E501 + "pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0", + "pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0", + "suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0", + "suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0", + "suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0", + "suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0", + "suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0", + "suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0", + "suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0", + "suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0", + "suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0", + "suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0", + "suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0", +} logger = logging.getLogger(__name__) +def parse_images_config(config: str) -> Dict: + """ + Parse a YAML config-defined images list. + + This function takes a YAML-formatted string 'config' containing a list of images + and returns a dictionary representing the images. + + Args: + config (str): YAML-formatted string representing a list of images. + + Returns: + Dict: A list of images. + """ + error_message = ( + f"Cannot parse a config-defined images list from config '{config}' - this" + "config input will be ignored." 
+ ) + if not config: + return [] + try: + images = yaml.safe_load(config) + except yaml.YAMLError as err: + logger.warning( + f"{error_message} Got error: {err}, while parsing the custom_image config." + ) + raise CheckFailed(error_message, BlockedStatus) + return images + + +def render_template(template_path: str, context: Dict) -> str: + """ + Render a Jinja2 template. + + This function takes the file path of a Jinja2 template and a context dictionary + containing the variables for template rendering. It loads the template, + substitutes the variables in the context, and returns the rendered content. + + Args: + template_path (str): The file path of the Jinja2 template. + context (Dict): A dictionary containing the variables for template rendering. + + Returns: + str: The rendered template content. + """ + template = Template(Path(template_path).read_text()) + rendered_template = template.render(**context) + return rendered_template + + class CheckFailed(Exception): """Raise this exception if one of the checks in main fails.""" @@ -41,7 +113,8 @@ def __init__(self, framework): self._stored.set_default(**self.gen_certs()) self.image = OCIImageResource(self, "oci-image") - + self.custom_images = [] + self.images_context = {} self.env = Environment(loader=FileSystemLoader("src/")) self.prometheus_provider = MetricsEndpointProvider( @@ -63,12 +136,43 @@ def __init__(self, framework): ]: self.framework.observe(event, self.set_pod_spec) + def get_images( + self, default_images: Dict[str, str], custom_images: Dict[str, str] + ) -> Dict[str, str]: + """ + Combine default images with custom images. + + This function takes two dictionaries, 'default_images' and 'custom_images', + representing the default set of images and the custom set of images respectively. + It combines the custom images into the default image list, overriding any matching + image names from the default list with the custom ones. 
+ + Args: + default_images (Dict[str, str]): A dictionary containing the default image names + as keys and their corresponding default image URIs as values. + custom_images (Dict[str, str]): A dictionary containing the custom image names + as keys and their corresponding custom image URIs as values. + + Returns: + Dict[str, str]: A dictionary representing the combined images, where image names + from the custom_images override any matching image names from the default_images. + """ + images = default_images + for image_name, custom_image in custom_images.items(): + if custom_image: + if image_name in images: + images[image_name] = custom_image + else: + logger.warning(f"image_name {image_name} not in image list, ignoring.") + return images + def set_pod_spec(self, event): self.model.unit.status = MaintenanceStatus("Setting pod spec") try: self._check_leader() - + self.custom_images = parse_images_config(self.model.config["custom_images"]) + self.images_context = self.get_images(DEFAULT_IMAGES, self.custom_images) image_details = self._check_image_details() except CheckFailed as check_failed: self.model.unit.status = check_failed.status @@ -198,7 +302,7 @@ def set_pod_spec(self, event): }, "configMaps": { "katib-config": { - f: Path(f"src/{f}.json").read_text() + f: render_template(f"src/templates/{f}.json.j2", self.images_context) for f in ( "metrics-collector-sidecar", "suggestion", @@ -206,7 +310,10 @@ def set_pod_spec(self, event): ) }, "trial-template": { - f + suffix: Path(f"src/{f}.yaml").read_text() + f + + suffix: render_template( + f"src/templates/{f}.yaml.j2", self.images_context + ) for f, suffix in ( ("defaultTrialTemplate", ".yaml"), ("enasCPUTemplate", ""), @@ -221,7 +328,7 @@ def set_pod_spec(self, event): def _rendered_webhook_definitions(self): ca_crt = b64encode(self._stored.ca.encode("ascii")).decode("utf-8") - yaml_file = self.env.get_template("webhooks.yaml").render(ca_bundle=ca_crt) + yaml_file = 
self.env.get_template("templates/webhooks.yaml.j2").render(ca_bundle=ca_crt) validating, mutating = yaml.safe_load_all(yaml_file) return validating, mutating diff --git a/charms/katib-controller/src/early-stopping.json b/charms/katib-controller/src/early-stopping.json deleted file mode 100644 index cf60ad6b..00000000 --- a/charms/katib-controller/src/early-stopping.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "medianstop": { - "image": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0" - } -} diff --git a/charms/katib-controller/src/metrics-collector-sidecar.json b/charms/katib-controller/src/metrics-collector-sidecar.json deleted file mode 100644 index fb4f1834..00000000 --- a/charms/katib-controller/src/metrics-collector-sidecar.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "StdOut": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0" - }, - "File": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0" - }, - "TensorFlowEvent": { - "image": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0", - "resources": { - "limits": { - "memory": "1Gi" - } - } - } -} diff --git a/charms/katib-controller/src/suggestion.json b/charms/katib-controller/src/suggestion.json deleted file mode 100644 index 10bbe15b..00000000 --- a/charms/katib-controller/src/suggestion.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "random": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0" - }, - "tpe": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0" - }, - "grid": { - "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0" - }, - "hyperband": { - "image": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0" - }, - "bayesianoptimization": { - "image": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0" - }, - "cmaes": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0" - }, - "sobol": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0" - }, - "multivariate-tpe": { - "image": 
"docker.io/kubeflowkatib/suggestion-optuna:v0.15.0" - }, - "enas": { - "image": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0", - "resources": { - "limits": { - "memory": "200Mi" - } - } - }, - "darts": { - "image": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0" - }, - "pbt": { - "image": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0", - "persistentVolumeClaimSpec": { - "accessModes": [ - "ReadWriteMany" - ], - "resources": { - "requests": { - "storage": "5Gi" - } - } - } - } -} diff --git a/charms/katib-controller/src/defaultTrialTemplate.yaml b/charms/katib-controller/src/templates/defaultTrialTemplate.yaml.j2 similarity index 87% rename from charms/katib-controller/src/defaultTrialTemplate.yaml rename to charms/katib-controller/src/templates/defaultTrialTemplate.yaml.j2 index 06d37967..aaef44ac 100644 --- a/charms/katib-controller/src/defaultTrialTemplate.yaml +++ b/charms/katib-controller/src/templates/defaultTrialTemplate.yaml.j2 @@ -5,7 +5,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0 + image: {{ default_trial_template }} command: - "python3" - "/opt/mxnet-mnist/mnist.py" diff --git a/charms/katib-controller/src/templates/early-stopping.json.j2 b/charms/katib-controller/src/templates/early-stopping.json.j2 new file mode 100644 index 00000000..8da4086d --- /dev/null +++ b/charms/katib-controller/src/templates/early-stopping.json.j2 @@ -0,0 +1,5 @@ +{ + "medianstop": { + "image": "{{ early_stopping__medianstop }}" + } +} diff --git a/charms/katib-controller/src/enasCPUTemplate.yaml b/charms/katib-controller/src/templates/enasCPUTemplate.yaml.j2 similarity index 85% rename from charms/katib-controller/src/enasCPUTemplate.yaml rename to charms/katib-controller/src/templates/enasCPUTemplate.yaml.j2 index 4965bd8e..5a663abc 100644 --- a/charms/katib-controller/src/enasCPUTemplate.yaml +++ b/charms/katib-controller/src/templates/enasCPUTemplate.yaml.j2 @@ -5,7 +5,7 @@ spec: spec: containers: - 
name: training-container - image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0 + image: {{ enas_cpu_template }} command: - python3 - -u diff --git a/charms/katib-controller/src/templates/metrics-collector-sidecar.json.j2 b/charms/katib-controller/src/templates/metrics-collector-sidecar.json.j2 new file mode 100644 index 00000000..dc90b69c --- /dev/null +++ b/charms/katib-controller/src/templates/metrics-collector-sidecar.json.j2 @@ -0,0 +1,16 @@ +{ + "StdOut": { + "image": "{{ metrics_collector_sidecar__stdout }}" + }, + "File": { + "image": "{{ metrics_collector_sidecar__file }}" + }, + "TensorFlowEvent": { + "image": "{{ metrics_collector_sidecar__tensorflow_event }}", + "resources": { + "limits": { + "memory": "1Gi" + } + } + } +} diff --git a/charms/katib-controller/src/pytorchJobTemplate.yaml b/charms/katib-controller/src/templates/pytorchJobTemplate.yaml.j2 similarity index 85% rename from charms/katib-controller/src/pytorchJobTemplate.yaml rename to charms/katib-controller/src/templates/pytorchJobTemplate.yaml.j2 index 29e0213f..98b9228e 100644 --- a/charms/katib-controller/src/pytorchJobTemplate.yaml +++ b/charms/katib-controller/src/templates/pytorchJobTemplate.yaml.j2 @@ -9,7 +9,7 @@ spec: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0 + image: {{ pytorch_job_template__master }} command: - "python3" - "/opt/pytorch-mnist/mnist.py" @@ -23,7 +23,7 @@ spec: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0 + image: {{ pytorch_job_template__worker }} command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/charms/katib-controller/src/templates/suggestion.json.j2 b/charms/katib-controller/src/templates/suggestion.json.j2 new file mode 100644 index 00000000..06563fa6 --- /dev/null +++ b/charms/katib-controller/src/templates/suggestion.json.j2 @@ -0,0 +1,50 @@ +{ + "random": { + "image": "{{ suggestion__random }}" + }, + "tpe": { + "image": "{{ 
suggestion__tpe }}" + }, + "grid": { + "image": "{{ suggestion__grid }}" + }, + "hyperband": { + "image": "{{ suggestion__hyperband }}" + }, + "bayesianoptimization": { + "image": "{{ suggestion__bayesianoptimization }}" + }, + "cmaes": { + "image": "{{ suggestion__cmaes }}" + }, + "sobol": { + "image": "{{ suggestion__sobol }}" + }, + "multivariate-tpe": { + "image": "{{ suggestion__multivariate_tpe }}" + }, + "enas": { + "image": "{{ suggestion__enas }}", + "resources": { + "limits": { + "memory": "200Mi" + } + } + }, + "darts": { + "image": "{{ suggestion__darts }}" + }, + "pbt": { + "image": "{{ suggestion__pbt }}", + "persistentVolumeClaimSpec": { + "accessModes": [ + "ReadWriteMany" + ], + "resources": { + "requests": { + "storage": "5Gi" + } + } + } + } +} diff --git a/charms/katib-controller/src/webhooks.yaml b/charms/katib-controller/src/templates/webhooks.yaml.j2 similarity index 100% rename from charms/katib-controller/src/webhooks.yaml rename to charms/katib-controller/src/templates/webhooks.yaml.j2 diff --git a/charms/katib-controller/tests/integration/test_charm.py b/charms/katib-controller/tests/integration/test_charm.py new file mode 100644 index 00000000..f9ae6c80 --- /dev/null +++ b/charms/katib-controller/tests/integration/test_charm.py @@ -0,0 +1,98 @@ +from pathlib import Path + +import lightkube +import pytest +import yaml +from lightkube.resources.core_v1 import ConfigMap +from pytest_operator.plugin import OpsTest + +METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) +CHARM_NAME = METADATA["name"] +KATIB_CONFIG = "katib-config" +TRIAL_TEMPLATE = "trial-template" +EXPECTED_KATIB_CONFIG = { + "early-stopping": '{\n "medianstop": {\n "image": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0"\n }\n}', # noqa: E501 + "metrics-collector-sidecar": '{\n "StdOut": {\n "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0"\n },\n "File": {\n "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0"\n },\n 
"TensorFlowEvent": {\n "image": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0",\n "resources": {\n "limits": {\n "memory": "1Gi"\n }\n }\n }\n}', # noqa: E501 + "suggestion": '{\n "random": {\n "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0"\n },\n "tpe": {\n "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0"\n },\n "grid": {\n "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0"\n },\n "hyperband": {\n "image": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0"\n },\n "bayesianoptimization": {\n "image": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0"\n },\n "cmaes": {\n "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0"\n },\n "sobol": {\n "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0"\n },\n "multivariate-tpe": {\n "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0"\n },\n "enas": {\n "image": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0",\n "resources": {\n "limits": {\n "memory": "200Mi"\n }\n }\n },\n "darts": {\n "image": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0"\n },\n "pbt": {\n "image": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0",\n "persistentVolumeClaimSpec": {\n "accessModes": [\n "ReadWriteMany"\n ],\n "resources": {\n "requests": {\n "storage": "5Gi"\n }\n }\n }\n }\n}', # noqa: E501 +} +EXPECTED_KATIB_CONFIG_CHANGED = { + "early-stopping": '{\n "medianstop": {\n "image": "custom:2.1"\n }\n}', # noqa: E501 + "metrics-collector-sidecar": '{\n "StdOut": {\n "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0"\n },\n "File": {\n "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0"\n },\n "TensorFlowEvent": {\n "image": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0",\n "resources": {\n "limits": {\n "memory": "1Gi"\n }\n }\n }\n}', # noqa: E501 + "suggestion": '{\n "random": {\n "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0"\n },\n "tpe": {\n "image": 
"docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0"\n },\n "grid": {\n "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0"\n },\n "hyperband": {\n "image": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0"\n },\n "bayesianoptimization": {\n "image": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0"\n },\n "cmaes": {\n "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0"\n },\n "sobol": {\n "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0"\n },\n "multivariate-tpe": {\n "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0"\n },\n "enas": {\n "image": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0",\n "resources": {\n "limits": {\n "memory": "200Mi"\n }\n }\n },\n "darts": {\n "image": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0"\n },\n "pbt": {\n "image": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0",\n "persistentVolumeClaimSpec": {\n "accessModes": [\n "ReadWriteMany"\n ],\n "resources": {\n "requests": {\n "storage": "5Gi"\n }\n }\n }\n }\n}', # noqa: E501 +} +EXPECTED_TRIAL_TEMPLATE = { + "defaultTrialTemplate.yaml": 'apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0\n command:\n - "python3"\n - "/opt/mxnet-mnist/mnist.py"\n - "--batch-size=64"\n - "--lr=${trialParameters.learningRate}"\n - "--num-layers=${trialParameters.numberLayers}"\n - "--optimizer=${trialParameters.optimizer}"\n restartPolicy: Never', # noqa: E501 + "enasCPUTemplate": 'apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0\n command:\n - python3\n - -u\n - RunTrial.py\n - --num_epochs=1\n - "--architecture=\\"${trialParameters.neuralNetworkArchitecture}\\""\n - "--nn_config=\\"${trialParameters.neuralNetworkConfig}\\""\n restartPolicy: Never', # noqa: E501 + "pytorchJobTemplate": 'apiVersion: kubeflow.org/v1\nkind: 
PyTorchJob\nspec:\n pytorchReplicaSpecs:\n Master:\n replicas: 1\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0\n command:\n - "python3"\n - "/opt/pytorch-mnist/mnist.py"\n - "--epochs=1"\n - "--lr=${trialParameters.learningRate}"\n - "--momentum=${trialParameters.momentum}"\n Worker:\n replicas: 2\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0\n command:\n - "python3"\n - "/opt/pytorch-mnist/mnist.py"\n - "--epochs=1"\n - "--lr=${trialParameters.learningRate}"\n - "--momentum=${trialParameters.momentum}"', # noqa: E501 +} +EXPECTED_TRIAL_TEMPLATE_CHANGED = { + "defaultTrialTemplate.yaml": 'apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: custom:1.0\n command:\n - "python3"\n - "/opt/mxnet-mnist/mnist.py"\n - "--batch-size=64"\n - "--lr=${trialParameters.learningRate}"\n - "--num-layers=${trialParameters.numberLayers}"\n - "--optimizer=${trialParameters.optimizer}"\n restartPolicy: Never', # noqa: E501 + "enasCPUTemplate": 'apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0\n command:\n - python3\n - -u\n - RunTrial.py\n - --num_epochs=1\n - "--architecture=\\"${trialParameters.neuralNetworkArchitecture}\\""\n - "--nn_config=\\"${trialParameters.neuralNetworkConfig}\\""\n restartPolicy: Never', # noqa: E501 + "pytorchJobTemplate": 'apiVersion: kubeflow.org/v1\nkind: PyTorchJob\nspec:\n pytorchReplicaSpecs:\n Master:\n replicas: 1\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0\n command:\n - "python3"\n - "/opt/pytorch-mnist/mnist.py"\n - "--epochs=1"\n - "--lr=${trialParameters.learningRate}"\n - 
"--momentum=${trialParameters.momentum}"\n Worker:\n replicas: 2\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0\n command:\n - "python3"\n - "/opt/pytorch-mnist/mnist.py"\n - "--epochs=1"\n - "--lr=${trialParameters.learningRate}"\n - "--momentum=${trialParameters.momentum}"', # noqa: E501 +} + + +@pytest.fixture(scope="session") +def lightkube_client() -> lightkube.Client: + client = lightkube.Client(field_manager=CHARM_NAME) + return client + + +class TestCharm: + @pytest.mark.abort_on_fail + async def test_build_and_deploy(self, ops_test: OpsTest): + """Build and deploy the charm. + + Assert on the unit status. + """ + charm_under_test = await ops_test.build_charm(".") + image_path = METADATA["resources"]["oci-image"]["upstream-source"] + resources = {"oci-image": image_path} + + await ops_test.model.deploy( + charm_under_test, resources=resources, application_name=CHARM_NAME + ) + + await ops_test.model.wait_for_idle( + apps=[CHARM_NAME], status="active", raise_on_blocked=True, timeout=300 + ) + + async def test_configmap_created(self, lightkube_client: lightkube.Client, ops_test: OpsTest): + """Test configmaps contents with default coonfig.""" + katib_config_cm = lightkube_client.get( + ConfigMap, KATIB_CONFIG, namespace=ops_test.model_name + ) + trial_template_cm = lightkube_client.get( + ConfigMap, TRIAL_TEMPLATE, namespace=ops_test.model_name + ) + + assert katib_config_cm.data == EXPECTED_KATIB_CONFIG + assert trial_template_cm.data == EXPECTED_TRIAL_TEMPLATE + + async def test_configmap_changes_with_config( + self, lightkube_client: lightkube.Client, ops_test: OpsTest + ): + await ops_test.model.applications[CHARM_NAME].set_config( + { + "custom_images": '{"default_trial_template": "custom:1.0", "early_stopping__medianstop": "custom:2.1"}' # noqa: E501 + } + ) + await ops_test.model.wait_for_idle( + apps=[CHARM_NAME], status="active", raise_on_blocked=True, 
timeout=300 + ) + katib_config_cm = lightkube_client.get( + ConfigMap, KATIB_CONFIG, namespace=ops_test.model_name + ) + trial_template_cm = lightkube_client.get( + ConfigMap, TRIAL_TEMPLATE, namespace=ops_test.model_name + ) + + assert katib_config_cm.data == EXPECTED_KATIB_CONFIG_CHANGED + assert trial_template_cm.data == EXPECTED_TRIAL_TEMPLATE_CHANGED + + async def test_blocked_on_invalid_config(self, ops_test: OpsTest): + await ops_test.model.applications[CHARM_NAME].set_config({"custom_images": "{"}) + await ops_test.model.wait_for_idle( + apps=[CHARM_NAME], status="blocked", raise_on_blocked=False, timeout=300 + ) + assert ops_test.model.applications[CHARM_NAME].units[0].workload_status == "blocked" diff --git a/charms/katib-controller/tox.ini b/charms/katib-controller/tox.ini index f79f14b8..37398985 100644 --- a/charms/katib-controller/tox.ini +++ b/charms/katib-controller/tox.ini @@ -10,8 +10,9 @@ skip_missing_interpreters = True envlist = fmt, lint, unit, integration [vars] -all_path = {[vars]src_path} +all_path = {[vars]src_path} {[vars]tst_path} src_path = {toxinidir}/src/ +tst_path = {toxinidir}/tests/ [testenv] passenv = @@ -40,8 +41,8 @@ description = Update requirements files by executing pip-compile on all requirem [testenv:fmt] commands = - isort {[vars]src_path} - black {[vars]src_path} + isort {[vars]all_path} + black {[vars]all_path} deps = -r requirements-fmt.txt description = Apply coding style standards to code @@ -60,4 +61,11 @@ commands = black --check --diff {[vars]all_path} deps = -r requirements-lint.txt -description = Check code against coding style standards \ No newline at end of file +description = Check code against coding style standards + +[testenv:integration] +commands = + pytest -v --tb native --asyncio-mode=auto {[vars]tst_path}integration/test_charm.py --log-cli-level=INFO -s {posargs} +deps = + -r requirements-integration.txt +description = Run integration tests \ No newline at end of file diff --git 
a/charms/katib-db-manager/requirements.in b/charms/katib-db-manager/requirements.in index 22332980..99380c70 100644 --- a/charms/katib-db-manager/requirements.in +++ b/charms/katib-db-manager/requirements.in @@ -1,5 +1,7 @@ ops>=2.2.0 oci-image +# Pinning until this resolves https://github.com/yaml/pyyaml/issues/724 +pyyaml==6.0.1 charmed-kubeflow-chisme>=0.0.8 lightkube lightkube-models>=1.25.4 diff --git a/charms/katib-db-manager/requirements.txt b/charms/katib-db-manager/requirements.txt index 304d9caa..e135229e 100644 --- a/charms/katib-db-manager/requirements.txt +++ b/charms/katib-db-manager/requirements.txt @@ -46,8 +46,9 @@ ops==2.3.0 # charmed-kubeflow-chisme ordered-set==4.1.0 # via deepdiff -pyyaml==6.0 +pyyaml==6.0.1 # via + # -r requirements.in # lightkube # ops ruamel-yaml==0.17.31 diff --git a/charms/katib-ui/requirements.in b/charms/katib-ui/requirements.in index c44a667c..55d788f4 100644 --- a/charms/katib-ui/requirements.in +++ b/charms/katib-ui/requirements.in @@ -3,4 +3,6 @@ oci-image serialized-data-interface lightkube jinja2 +# Pinning until this resolves https://github.com/yaml/pyyaml/issues/724 +pyyaml==6.0.1 charmed-kubeflow-chisme \ No newline at end of file diff --git a/charms/katib-ui/requirements.txt b/charms/katib-ui/requirements.txt index ccf8c598..5ed5c39c 100644 --- a/charms/katib-ui/requirements.txt +++ b/charms/katib-ui/requirements.txt @@ -61,8 +61,9 @@ pkgutil-resolve-name==1.3.10 # via jsonschema pyrsistent==0.19.3 # via jsonschema -pyyaml==6.0 +pyyaml==6.0.1 # via + # -r requirements.in # lightkube # ops # serialized-data-interface diff --git a/tests/integration/test_charms.py b/tests/integration/test_charms.py index fbaf73e8..70242fb9 100644 --- a/tests/integration/test_charms.py +++ b/tests/integration/test_charms.py @@ -30,7 +30,6 @@ @pytest.mark.skip_if_deployed @pytest.mark.abort_on_fail async def test_deploy_katib_charms(ops_test: OpsTest): - # Build katib-controller, katib-db-manager, and katib-ui charms 
controller_charm = await ops_test.build_charm(CONTROLLER_PATH) db_manager_charm = await ops_test.build_charm(DB_PATH) @@ -44,7 +43,9 @@ async def test_deploy_katib_charms(ops_test: OpsTest): # Deploy katib-controller, katib-db-manager, and katib-ui charms await ops_test.model.deploy(controller_charm, resources={"oci-image": controller_image_path}) - await ops_test.model.deploy(db_manager_charm, resources={"oci-image": db_image_path}, trust=True) + await ops_test.model.deploy( + db_manager_charm, resources={"oci-image": db_image_path}, trust=True + ) await ops_test.model.deploy(ui_charm, resources={"oci-image": ui_image_path}, trust=True) @@ -73,7 +74,6 @@ async def test_deploy_katib_charms(ops_test: OpsTest): async def test_create_experiment(ops_test: OpsTest): - namespace = ops_test.model_name lightkube_client = lightkube.Client() diff --git a/tox.ini b/tox.ini index facd9280..bb33d1e9 100644 --- a/tox.ini +++ b/tox.ini @@ -7,22 +7,24 @@ max-line-length = 100 [tox] skipsdist = True skip_missing_interpreters = True -envlist = fmt, lint, unit, integration +envlist = fmt, lint, unit, integration, {katib-controller}-{lint,unit,integration} [vars] all_path = {[vars]tst_path} tst_path = {toxinidir}/tests/ [testenv] -passenv = - PYTHONPATH - CHARM_BUILD_DIR - MODEL_SETTINGS - KUBECONFIG +allowlist_externals = + tox setenv = - PYTHONPATH = {toxinidir}:{toxinidir}/lib:{[vars]tst_path} - PYTHONBREAKPOINT=ipdb.set_trace - PY_COLORS=1 + controller: CHARM = controller + db-manager: CHARM = db-manager + ui: CHARM = ui + unit: TYPE = unit + lint: TYPE = lint + integration: TYPE = integration +commands = + tox -c charms/katib-{env:CHARM} -e {env:TYPE} [testenv:update-requirements] allowlist_externals =