feat: Add kfp-tensorflow notebook to confirm NVIDIA GPU access
Add a `kfp-tensorflow` notebook in order to confirm access to a GPU. The
notebook uses the kfp SDK to create an experiment and a run that
succeeds when:
 * the run's pod is scheduled on a node with an NVIDIA GPU
 * the run's code, and more specifically the TensorFlow framework, has
   access to an NVIDIA GPU.
orfeas-k committed Nov 28, 2024
1 parent fe35a1b commit af73b9b
Showing 3 changed files with 317 additions and 0 deletions.
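How the run requests a GPU: the add_gpu_request helper in the notebook below asks KFP for one nvidia.com/gpu accelerator. The sketch that follows is not part of the commit; it is a rough illustration, assuming the KFP v2 backend maps the accelerator settings onto a standard Kubernetes extended-resource limit, of what the run's container is expected to request and how that relates to the two success conditions in the commit message.

# Illustrative sketch, not part of this commit. Assumes the KFP v2 backend turns
# add_node_selector_constraint(accelerator="nvidia.com/gpu") + set_accelerator_limit(1)
# into an extended-resource limit on the task's container.
expected_container_resources = {
    "limits": {
        # Only satisfiable on nodes where the NVIDIA device plugin advertises GPUs,
        # which is what places the run's pod on a GPU node (first condition). The
        # second condition is then checked in-container by TensorFlow itself.
        "nvidia.com/gpu": "1",
    }
}
print(expected_container_resources)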
205 changes: 205 additions & 0 deletions tests/notebooks/gpu/kfp-tensorflow/kfp-tensorflow-integration.ipynb
@@ -0,0 +1,205 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test KFP Integration\n",
"\n",
"- create an experiment\n",
"- create a run\n",
"- check that the run passes. This happens only when both of the following are true:\n",
" * the run's pod is scheduled on a node with an NVIDIA GPU\n",
" * the code, and more specifically Tensorflow framework, has access to a GPU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Please check the requirements.in file for more details\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp\n",
"import os\n",
"\n",
"from kfp import dsl\n",
"from tenacity import retry, stop_after_attempt, wait_exponential"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client = kfp.Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"EXPERIMENT_NAME = 'Check access to GPU'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"HTTP_PROXY = HTTPS_PROXY = NO_PROXY = None\n",
"\n",
"if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n",
" HTTP_PROXY = os.environ['HTTP_PROXY']\n",
" HTTPS_PROXY = os.environ['HTTPS_PROXY']\n",
" NO_PROXY = os.environ['NO_PROXY']\n",
"\n",
"def add_proxy(obj, http_proxy=HTTP_PROXY, https_proxy=HTTPS_PROXY, no_proxy=NO_PROXY):\n",
" \"\"\"Adds the proxy env vars to the PipelineTask object.\"\"\"\n",
" return (\n",
" obj.set_env_variable(name='http_proxy', value=http_proxy)\n",
" .set_env_variable(name='https_proxy', value=https_proxy)\n",
" .set_env_variable(name='HTTP_PROXY', value=http_proxy)\n",
" .set_env_variable(name='HTTPS_PROXY', value=https_proxy)\n",
" .set_env_variable(name='no_proxy', value=no_proxy)\n",
" .set_env_variable(name='NO_PROXY', value=no_proxy)\n",
" )\n",
"\n",
"def proxy_envs_set():\n",
" \"\"\"Check if the proxy env vars are set\"\"\"\n",
" if HTTP_PROXY and HTTPS_PROXY and NO_PROXY:\n",
" return True\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@dsl.component(base_image=\"kubeflownotebookswg/jupyter-tensorflow-cuda:v1.9.0\")\n",
"def gpu_check() -> str:\n",
" \"\"\"Check access to a GPU.\"\"\"\n",
" import tensorflow as tf\n",
"\n",
" gpus = tf.config.list_physical_devices('GPU')\n",
" print(\"GPU list:\", gpus)\n",
" if not gpus:\n",
" raise RuntimeError(\"No GPU has been detected.\")\n",
" return str(len(gpus)>0)\n",
"\n",
"def add_gpu_request(obj):\n",
" \"\"\"Add a request field for a GPU to the container created by the PipelineTask object.\"\"\"\n",
" return ( obj.add_node_selector_constraint(accelerator = \"nvidia.com/gpu\").set_accelerator_limit(limit = 1) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline\n",
"def gpu_check_pipeline() -> str:\n",
" \"\"\"Create a pipeline that runs code to check access to a GPU.\"\"\"\n",
" gpu_check1 = add_gpu_request(gpu_check())\n",
" return gpu_check1.output\n",
"\n",
"@dsl.pipeline\n",
"def gpu_check_pipeline_proxy() -> str:\n",
" \"\"\"Create a pipeline that runs code to check access to a GPU and sets the appropriate proxy ENV variables.\"\"\"\n",
" gpu_check1 = add_proxy(add_gpu_request(gpu_check()))\n",
" return gpu_check1.output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Setting enable_caching to False to overcome https://github.com/canonical/bundle-kubeflow/issues/1067\n",
"if proxy_envs_set():\n",
" run = client.create_run_from_pipeline_func(\n",
" gpu_check_pipeline_proxy,\n",
" experiment_name=EXPERIMENT_NAME,\n",
" enable_caching=False,\n",
" )\n",
"else:\n",
" run = client.create_run_from_pipeline_func(\n",
" gpu_check_pipeline,\n",
" experiment_name=EXPERIMENT_NAME,\n",
" enable_caching=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.list_experiments().experiments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.get_run(run.run_id).state"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@retry(\n",
" wait=wait_exponential(multiplier=2, min=1, max=10),\n",
" stop=stop_after_attempt(30),\n",
" reraise=True,\n",
")\n",
"def assert_run_succeeded(client, run_id):\n",
" \"\"\"Wait for the run to complete successfully.\"\"\"\n",
" status = client.get_run(run_id).state\n",
" assert status == \"SUCCEEDED\", f\"KFP run in {status} state.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fetch KFP experiment to ensure it exists\n",
"client.get_experiment(experiment_name=EXPERIMENT_NAME)\n",
"\n",
"assert_run_succeeded(client, run.run_id)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 2 additions & 0 deletions tests/notebooks/gpu/kfp-tensorflow/requirements.in
@@ -0,0 +1,2 @@
kfp>=2.4,<3.0
tenacity
110 changes: 110 additions & 0 deletions tests/notebooks/gpu/kfp-tensorflow/requirements.txt
@@ -0,0 +1,110 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements.in
#
cachetools==5.5.0
# via google-auth
certifi==2024.8.30
# via
# kfp-server-api
# kubernetes
# requests
charset-normalizer==3.4.0
# via requests
click==8.1.7
# via kfp
docstring-parser==0.16
# via kfp
google-api-core==2.23.0
# via
# google-cloud-core
# google-cloud-storage
# kfp
google-auth==2.36.0
# via
# google-api-core
# google-cloud-core
# google-cloud-storage
# kfp
# kubernetes
google-cloud-core==2.4.1
# via google-cloud-storage
google-cloud-storage==2.18.2
# via kfp
google-crc32c==1.6.0
# via
# google-cloud-storage
# google-resumable-media
google-resumable-media==2.7.2
# via google-cloud-storage
googleapis-common-protos==1.66.0
# via google-api-core
idna==3.10
# via requests
kfp==2.10.1
# via -r requirements.in
kfp-pipeline-spec==0.5.0
# via kfp
kfp-server-api==2.3.0
# via kfp
kubernetes==30.1.0
# via kfp
oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
proto-plus==1.25.0
# via google-api-core
protobuf==4.25.5
# via
# google-api-core
# googleapis-common-protos
# kfp
# kfp-pipeline-spec
# proto-plus
pyasn1==0.6.1
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.4.1
# via google-auth
python-dateutil==2.9.0.post0
# via
# kfp-server-api
# kubernetes
pyyaml==6.0.2
# via
# kfp
# kubernetes
requests==2.32.3
# via
# google-api-core
# google-cloud-storage
# kubernetes
# requests-oauthlib
# requests-toolbelt
requests-oauthlib==2.0.0
# via kubernetes
requests-toolbelt==0.10.1
# via kfp
rsa==4.9
# via google-auth
six==1.16.0
# via
# kfp-server-api
# kubernetes
# python-dateutil
tabulate==0.9.0
# via kfp
tenacity==9.0.0
# via -r requirements.in
urllib3==1.26.20
# via
# kfp
# kfp-server-api
# kubernetes
# requests
websocket-client==1.8.0
# via kubernetes
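Because the run can only succeed on a cluster that actually advertises NVIDIA GPUs, it can be worth confirming this before submitting the pipeline. The snippet below is a hypothetical pre-check, not part of this commit, that uses the kubernetes Python client (already pulled in above as a kfp dependency); it assumes in-cluster or kubeconfig credentials and that GPUs are exposed through the standard nvidia.com/gpu extended resource.

# Hypothetical pre-check, not part of this commit: list the nodes that advertise
# the nvidia.com/gpu extended resource, i.e. nodes the run's pod could land on.
from kubernetes import client, config

try:
    # Running inside the cluster (e.g. from a Kubeflow notebook pod).
    config.load_incluster_config()
except config.ConfigException:
    # Fall back to a local kubeconfig when running outside the cluster.
    config.load_kube_config()

v1 = client.CoreV1Api()
gpu_nodes = [
    node.metadata.name
    for node in v1.list_node().items
    if int((node.status.allocatable or {}).get("nvidia.com/gpu", "0")) > 0
]
print("Nodes advertising nvidia.com/gpu:", gpu_nodes or "none")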
