diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 82e8c09da..9324ead7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,7 +62,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - name: Test KFP lib + - name: Test KFP v1 lib run: | source kind/requirements.env export PATH=$PATH:/tmp/ @@ -93,7 +93,7 @@ jobs: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - - name: Test KFP worflow run + - name: Test KFP v1 workflow run timeout-minutes: 120 run: | source kind/requirements.env @@ -108,6 +108,8 @@ jobs: chmod 777 /tmp/kubectl curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 make -C kind setup - make -C transforms workflow-build + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop/ workflow-build make -C transforms/universal/noop workflow-test diff --git a/.make.defaults b/.make.defaults index 276fa35c9..2cc29b6e5 100644 --- a/.make.defaults +++ b/.make.defaults @@ -53,6 +53,9 @@ KIND_CLUSTER_NAME=dataprep DPK_PYTHON_LIB_DIR=$(REPOROOT)/data-processing-lib/python DPK_RAY_LIB_DIR=$(REPOROOT)/data-processing-lib/ray DPK_SPARK_LIB_DIR=$(REPOROOT)/data-processing-lib/spark + +KFPv2?=0 + ####################################################################################### # Lists all targets and optional help text found in the target. # Adapted from https://stackoverflow.com/a/65243296/45375 @@ -200,7 +203,7 @@ __check_defined = \ cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} # Build and image using the local Dockerfile and make the data-processing-lib/python -# available in the current directory for use by the Dockerfile (i.e. to install the library). +# available in the current directory for use by the Dockerfile (i.e. to install the library). .PHONY: .defaults.python-lib-src-image .defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source @@ -261,8 +264,8 @@ __check_defined = \ # Install all source from the repo for a python runtime transform into an existing venv .PHONY: .defaults.install-python-lib-src-venv -.defaults.install-python-lib-src-venv:: - @# Help: Install Python data processing library source into existing venv +.defaults.install-python-lib-src-venv:: + @# Help: Install Python data processing library source into existing venv @echo Installing Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ @@ -276,8 +279,8 @@ __check_defined = \ # Install all source from the repo for a ray runtime transform into an existing venv .PHONY: .defaults.install-ray-lib-src-venv -.defaults.install-ray-lib-src-venv:: - @# Help: Install Ray and Python data processing library source into existing venv +.defaults.install-ray-lib-src-venv:: + @# Help: Install Ray and Python data processing library source into existing venv @echo Installing Ray and Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ @@ -291,11 +294,10 @@ __check_defined = \ .PHONY: .defaults.spark-lib-src-venv .defaults.spark-lib-src-venv:: .defaults.create-venv .defaults.install-spark-lib-src-venv .defaults.install-local-requirements-venv -# Install all source from the repo for a spark runtime transform into an existing venv +# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source. .PHONY: .defaults.install-spark-lib-src-venv -.defaults.install-spark-lib-src-venv:: - @# Help: Install Spark and Python data processing library source into existing venv - @echo "" +.defaults.install-spark-lib-src-venv:: + @# Help: Install Spark and Python data processing library source into existing venv @echo Installing Spark and Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ diff --git a/.make.versions b/.make.versions index 08ce283f0..5ba577f4b 100644 --- a/.make.versions +++ b/.make.versions @@ -13,6 +13,7 @@ RELEASE_VERSION_SUFFIX=.dev6 # Data prep lab wheel version DPK_LIB_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) DPK_LIB_KFP_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) +DPK_LIB_KFP_VERSION_v2=0.2.0$(RELEASE_VERSION_SUFFIX) # Begin transform versions/tags BLOCKLIST_VERSION=0.4.2$(RELEASE_VERSION_SUFFIX) diff --git a/kfp/doc/setup.md b/kfp/doc/setup.md index e4803e16b..84385ef0f 100644 --- a/kfp/doc/setup.md +++ b/kfp/doc/setup.md @@ -66,7 +66,14 @@ choose your OS system, and process according to "(Optional) Install the MinIO Cl ## Installation steps -You can create a Kind cluster with all required software installed using the following command: +Before installation, you have to decide which KFP version you want to use.
+In order to use KFP v2, please set the following environment variable: + +```shell +export KFPv2=1 +``` + +Now, you can create a Kind cluster with all required software installed using the following command: ```shell make setup diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 220702cbc..539d3cdf5 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -34,16 +34,16 @@ Note: the project and the explanation below are based on [KFPv1](https://www.kub * Pipeline wiring - definition of the sequence of invocation (with parameter passing) of participating components * Additional configuration -### Imports definition +### Imports definition ```python import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, ) from kubernetes import client as k8s_client ``` @@ -73,8 +73,8 @@ Ray cluster. For each step we have to define a component that will execute them: Note: here we are using shared components described in this [document](../kfp_ray_components/README.md) for `create_ray_op`, `execute_ray_jobs_op` and `cleanup_ray_op`, while `compute_exec_params_op` component is built inline, because it might differ significantly. For "simple" pipeline cases we can use the -[default implementation](../kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py), -while, for example for exact dedup, we are using a very [specialized one](../transform_workflows/universal/ededup/src/ededup_compute_execution_params.py). +[default implementation](../kfp_support_lib/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py), +while, for example for exact dedup, we are using a very [specialized one](../../transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py). ### Input parameters definition diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 69f9f0d67..a012640ec 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -1,25 +1,35 @@ FROM docker.io/rayproject/ray:2.9.3-py310 -ARG BUILD_DATE -ARG GIT_COMMIT - -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT - # install libraries COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt -# Copy and install data processing libraries +# Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . -COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ +COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . -COPY --chown=ray:users kfp_support_lib/ kfp_support_lib/ -RUN cd kfp_support_lib && pip install --no-cache-dir -e . +COPY --chown=ray:users python_apiserver_client python_apiserver_client/ +RUN cd python_apiserver_client && pip install --no-cache-dir -e . + +COPY --chown=ray:users workflow_support_lib workflow_support_lib/ +RUN cd workflow_support_lib && pip install --no-cache-dir -e . 
+ +# overwriting the installation of old versions of pydantic +RUN pip install --no-cache-dir pydantic==2.6.3 + # remove credentials-containing file RUN rm requirements.txt # components COPY ./src /pipelines/component/src + +# Set environment +ENV KFP_v2=$KFP_v2 + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index a7743d2f3..e8a8c3adb 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -2,26 +2,39 @@ # # know where they are running from. REPOROOT=../.. +# Include the common rules. +# Use "make help" to see them. +include $(REPOROOT)/.make.defaults + IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed 's/=/:=/' | sed 's/^/export /' > makeenv") include makeenv -DOCKER_FILE=Dockerfile + +ifeq ($(KFPv2), 1) +DOCKER_IMAGE_NAME=kfp-data-processing_v2 +DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION_v2} +WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support +else DOCKER_IMAGE_NAME=kfp-data-processing DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} +WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support +endif -# Include the common rules. -# Use "make help" to see them. -include $(REPOROOT)/.make.defaults + +#DOCKER_IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_VERSION} +DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) .PHONY: .lib-src-image .lib-src-image:: $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python - $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib LIB_NAME=kfp_support_lib + $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client LIB_NAME=python_apiserver_client + $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB) LIB_NAME=workflow_support_lib $(MAKE) .defaults.image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python - rm -rf kfp_support_lib + rm -rf python_apiserver_client + rm -rf workflow_support_lib .PHONY: image image: Dockerfile requirements.txt @@ -34,11 +47,12 @@ set-versions:: reconcile-requirements .PHONY: reconcile-requirements reconcile-requirements: @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" 
executeRayJobComponent_multi_s3.yaml + # TODO remove it for KFPv2 + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml .PHONY: load-image load-image: diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index fd04cfbe6..6f402affa 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -6,7 +6,7 @@ inputs: - { name: run_id, type: String, description: "The KFP Run ID" } - { name: additional_params, type: String, description: "additional parameters" } # The component converts the dictionary to json string - - { name: exec_params, type: dict, description: "job parameters" } + - { name: exec_params, type: JsonObject, description: "job parameters" } - { name: exec_script_name, type: String, description: "transform script name" } - { name: server_url, type: String, default: "", description: "url of api server" } diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9f17afed4..fe0700b33 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -8,7 +8,7 @@ inputs: - { name: server_url, type: String, default: "", description: "url of api server" } - { name: prefix, type: String, default: "", description: "prefix for extra credentials" } # The component converts the dictionary to json string - - { name: exec_params, type: dict, description: "job parameters" } + - { name: exec_params, type: JsonObject, description: "job parameters" } - { name: additional_params, type: String, description: "additional parameters" } implementation: diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index 190acf80b..a2b16d577 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -9,10 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - import sys - -from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs +from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs def start_ray_cluster( diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index fc5016b87..55cf2f34b 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -9,11 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - import sys - -from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs - +from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs # Cleans and shutdowns the Ray cluster def cleanup_ray_cluster( diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 74d42df1a..173ccb06a 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -9,9 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ - -from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs - +from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": import argparse diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 1e58a5e66..b7b5d9863 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -10,8 +10,7 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs - +from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": import argparse diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 52f8c0da4..f15877d86 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -1,9 +1,11 @@ import sys -from data_processing.utils.params_utils import ParamsUtils -from kfp_support.workflow_support.utils import KFPUtils, PipelinesUtils +from workflow_support.runtime_utils import KFPUtils +from workflow_support.pipeline_utils import PipelinesUtils +from data_processing.utils import ParamsUtils + def invoke_sub_workflow( name: str, # workflow name prefix: str, # workflow arguments prefix diff --git a/kfp/kfp_support_lib/Makefile b/kfp/kfp_support_lib/Makefile index aad549c7a..31f702221 100644 --- a/kfp/kfp_support_lib/Makefile +++ b/kfp/kfp_support_lib/Makefile @@ -1,54 +1,48 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../.. -include ${REPOROOT}/.make.versions -include ${REPOROOT}/kfp/requirements.env +################################################################################################################# +# +# This is the top level makefile, which is intended to be able to process a common set of rules on all +# sub-projects underneath this directory. Currently, the common/standardized set of rules are as follows +# and supported by .make.defaults +# +# setup: +# clean: +# build: +# test: +# +# When finally getting to a makefile that requires a rule implementation, for example to test the build, +# that makefile should override/implement the rule to meet its needs. Such a rule may continue to recurse +# using "$(MAKE) -recurse", for example "$(MAKE) test-recurse". +# +# Each rule is called recursively on sub-directories and if a similar inclusion is done in the sub-Makefiles, +# the rules will be applied/executed recursively in their sub-directories. +# +################################################################################################################# -# Include the common rules. -# Use "make help" to see them. -include ../../.make.defaults +REPOROOT=../.. -# Command to run pytest -PYTHON_VERSION=$(shell $(PYTHON) --version) -VENV_ACTIVATE=venv/bin/activate +# Get some common rules for the whole repo +include $(REPOROOT)/.make.defaults -DEPLOY_KUBEFLOW ?= 0 +########## ########## ########## ########## ########## ########## ########## ########## +# Global rules that are generally to be implemented in the sub-directories and can +# be overridden there (the double colon on the rule makes the overridable). 
clean:: - @# Help: Clean up the distribution build and the venv - rm -r dist venv || true - rm -rf src/*egg-info || true - rm -rf *.back || true - - -.check-env:: .check_python_version - @echo "Checks passed" - -set-versions:: .check-env - $(MAKE) TOML_VERSION=$(DPK_LIB_KFP_VERSION) .defaults.update-toml - sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml - sed -i.back 's/ray==[0-9].*/ray==${RAY}",/' pyproject.toml - -build-dist:: set-versions .defaults.build-dist - -publish:: publish-dist + @# Help: Recursively $@ in all subdirs + $(MAKE) RULE=$@ .recurse -publish-dist:: .check-env .defaults.publish-dist +setup:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse -build:: build-dist +build:: + @# Help: Recursively $@ in all subdirs + $(MAKE) RULE=$@ .recurse -venv:: pyproject.toml .check-env .defaults.venv - $(MAKE) .defaults.install-python-lib-src-venv - . ${VENV_ACTIVATE}; \ - pip install -e .; \ - pip install pytest pytest-cov; - @# Help: Create the virtual environment using pyproject.toml +test:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse -test:: venv - @# Help: Use the already-built virtual environment to run pytest on the test directory. - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; -ifeq ($(DEPLOY_KUBEFLOW),1) - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; -endif +image:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse diff --git a/kfp/kfp_support_lib/README.md b/kfp/kfp_support_lib/README.md index 86f3f4360..440fc16c3 100644 --- a/kfp/kfp_support_lib/README.md +++ b/kfp/kfp_support_lib/README.md @@ -1,10 +1,13 @@ # KFP support library This provides support for implementing KFP pipelines automating transform's execution. -It comprises 2 main modules +It comprises 3 main modules -* [api server client](src/kfp_support/api_server_client/README.md) -* [workflow support](src/kfp_support/workflow_support/README.md) +* [api server client](python_apiserver_client/README.md) +* [kfp_v1_workflow_support](kfp_v1_workflow_support/README.md) +* [kfp_v2_workflow_support](kfp_v2_workflow_support/README.md) + +Depending on the KFP version used, either `kfp_v1_workflow_support` or `kfp_v2_workflow_support` should be used. ## Development diff --git a/kfp/kfp_support_lib/doc/kfp_support_library.md b/kfp/kfp_support_lib/doc/kfp_support_library.md index 0ae5e9d1c..fc571eb81 100644 --- a/kfp/kfp_support_lib/doc/kfp_support_library.md +++ b/kfp/kfp_support_lib/doc/kfp_support_library.md @@ -2,7 +2,7 @@ This library is aimed to simplify transform pipelines implementations and consists of 2 main parts: -* [API Server Client](../src/kfp_support/api_server_client/README.md) +* [API Server Client](../python_apiserver_client/README.md) * [workflow support](../src/kfp_support/workflow_support/README.md) See also how this library is used for [kfp components](../../kfp_ray_components/README.md) implementation diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile new file mode 100644 index 000000000..9cebae629 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -0,0 +1,82 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from.
+REPOROOT=../../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. +include ${REPOROOT}/.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +set-versions:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml + sed -i.back 's/data_prep_toolkit_ray==[0-9].*/data_prep_toolkit_ray==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml + sed -i.back 's/ray==[0-9].*/ray==${RAY}",/' pyproject.toml + +build:: set-versions venv +ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make build` in upper directories and KFPv2==1 + echo "Skipping build as KFPv2 is defined" +else + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build +endif + +publish:: .check-env + @# Help: Publish the wheel to testpypi + if [ -d "dist" ]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv:: pyproject.toml .check-env +ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make venv` in upper directories and KFPv2==1 + echo "Skipping venv as KFPv2 is defined" +else + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . ${VENV_ACTIVATE}; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/python; \ + cd ../../../data-processing-lib/ray && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/ray; \ + cd ../python_apiserver_client && make set-versions && cd -; \ + pip install -e ../python_apiserver_client; \ + pip install -e .; \ + pip install pytest pytest-cov +endif + +test:: venv +ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make test` in upper directories and KFPv2==1 + echo "Skipping test as KFPv2 is defined" +else + @# Help: Use the already-built virtual environment to run pytest on the test directory. +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; + . 
${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; +endif +endif diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/README.md b/kfp/kfp_support_lib/kfp_v1_workflow_support/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/kfp_support_lib/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml similarity index 86% rename from kfp/kfp_support_lib/pyproject.toml rename to kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 7816cc5ba..679f7ed08 100644 --- a/kfp/kfp_support_lib/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "data_prep_toolkit_kfp" +name = "data_prep_toolkit_kfp_v1" version = "0.2.0.dev6" -requires-python = ">=3.10" +requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -15,7 +15,8 @@ dependencies = [ "kfp==1.8.22", "ray==2.9.3", "requests", - "data-prep-toolkit==0.2.0.dev6", + "data_prep_toolkit_ray==0.2.0.dev6", + "python_apiserver_client==0.1.0", ] [build-system] @@ -37,7 +38,8 @@ dev = [ package_dir = ["src"] [options.packages.find] -where = ["src/kfp_support"] +where = ["src/workflow_support"] + [tool.pytest.ini_options] addopts = "--cov --cov-report term-missing --cov-fail-under 10" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py new file mode 100644 index 000000000..6b99a6be1 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -0,0 +1,6 @@ +from workflow_support.compile_utils.component import ( + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, + ComponentUtils +) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py similarity index 63% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py index 46e55024d..460b20e23 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py @@ -92,50 +92,3 @@ def add_secret_volume_to_com_function(component: dsl.ContainerOp, secretName: st secret=k8s_client.V1SecretVolumeSource(secret_name=secretName, optional=optional), ) component.add_pvolumes({mountPoint: vol}) - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - 
cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py new file mode 100644 index 000000000..0e80d97a2 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py @@ -0,0 +1 @@ +from workflow_support.pipeline_utils.pipeline_utils import PipelinesUtils diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipeline_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py new file mode 100644 index 000000000..183331a2b --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -0,0 +1,75 @@ +import os +import sys + +from data_processing.utils import get_logger, str2bool + +from workflow_support.pipeline_utils import PipelinesUtils + + +logger = get_logger(__name__) + + +def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/", overwrite: bool = True): + """ + Upload and run a single pipeline + + :param pipeline_package_path: Local path to the pipeline package. + :param endpoint: endpoint to kfp service. + :return the pipeline name as it appears in the kfp GUI. 
+ """ + tmout: int = 800 + wait: int = 60 + file_name = os.path.basename(pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=overwrite, + ) + if pipeline is None: + return None + experiment = utils.get_experiment_by_name() + run_id = utils.start_pipeline(pipeline, experiment, params=[]) + status, error = utils.wait_pipeline_completion(run_id=run_id, timeout=tmout, wait=wait) + if status.lower() not in ["succeeded", "completed"]: + # Execution failed + logger.warning(f"Pipeline {pipeline_name} failed with error {error} and status {status}") + return None + logger.info(f"Pipeline {pipeline_name} successfully completed") + return pipeline_name + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Run sanity test") + parser.add_argument("-c", "--command", type=str, choices=["upload", "sanity-test"]) + parser.add_argument("-e", "--endpoint", type=str, default="http://localhost:8080/") + parser.add_argument("-p", "--pipeline_package_path", type=str, default="") + parser.add_argument("-o", "--overwrite", type=str, default="True") + + args = parser.parse_args() + match args.command: + case "upload": + file_name = os.path.basename(args.pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=args.endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=args.pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=str2bool(args.overwrite), + ) + if pipeline is None: + sys.exit(1) + case "sanity-test": + run = run_test( + endpoint=args.endpoint, + pipeline_package_path=args.pipeline_package_path, + overwrite=str2bool(args.overwrite), + ) + if run is None: + sys.exit(1) + case _: + logger.warning("Unsupported command") + exit(1) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..8d2cdd648 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py @@ -0,0 +1,2 @@ +from workflow_support.runtime_utils.kfp_utils import KFPUtils +from workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py similarity index 66% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py index ef00b0e92..feb081dd2 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/kfp_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py @@ -111,3 +111,50 @@ def load_from_json(js: str) -> dict[str, Any]: except Exception as e: logger.warning(f"Failed to load parameters {js} with error {e}") sys.exit(1) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: 
actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py index 40b26c7a1..0b20b28c4 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py @@ -17,8 +17,8 @@ from data_processing.data_access import DataAccess, DataAccessFactory from data_processing.utils import ParamsUtils, get_logger -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, Cluster, @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from kfp_support.workflow_support.utils import KFPUtils +from workflow_support.runtime_utils import KFPUtils from ray.job_submission import JobStatus diff --git a/kfp/kfp_support_lib/test/configmaps.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py similarity index 100% rename from kfp/kfp_support_lib/test/configmaps.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py diff --git a/kfp/kfp_support_lib/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py similarity index 90% rename from kfp/kfp_support_lib/test/pipeline_utils_test.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py index 2630552ee..200bf1676 100644 --- a/kfp/kfp_support_lib/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py @@ -10,14 
+10,15 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.utils import PipelinesUtils +from workflow_support.pipeline_utils import PipelinesUtils +server_url = "http://localhost:8080/" def test_pipelines(): """ Test pipelines utils """ - utils = PipelinesUtils(host="http://localhost:8080/kfp") + utils = PipelinesUtils(host=server_url) # get pipeline by name pipeline = utils.get_pipeline_by_name("[Tutorial] Data passing in python components") assert pipeline is not None diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py new file mode 100644 index 000000000..ab25573b0 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -0,0 +1,91 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from configmaps import ConfigmapsManager +from python_apiserver_client.params import ConfigMapVolume +from workflow_support.runtime_utils import RayRemoteJobs + +server_url = "http://localhost:8080/ray/" + +def test_ray_remote_jobs(): + """ + Test the full cycle of job submission + :return: + """ + # This shows how to create volumes dictionary + volumes = [ + ConfigMapVolume( + name="code-sample", + mount_path="/home/ray/samples", + source="ray-job-code-sample", + items={"sample_code.py": "sample_code.py"}, + ) + ] + dct_volumes = {"volumes": [v.to_dict() for v in volumes]} + + head_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + # Ray start params, just to show + "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"}, + "image_pull_policy": "Always", + } | dct_volumes + + worker_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + "replicas": 1, + "min_replicas": 1, + "max_replicas": 1, + "image_pull_policy": "Always", + } | dct_volumes + + # Create configmap for testing + cm_manager = ConfigmapsManager() + cm_manager.delete_code_map() + cm_manager.create_code_map() + + # create cluster + remote_jobs = RayRemoteJobs(server_url=server_url) + status, error = remote_jobs.create_ray_cluster( + name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] + ) + print(f"Created cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None + # submitting ray job + runtime_env = """ + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: test_counter + """ + status, error, submission = remote_jobs.submit_job( + name="job-test", + namespace="default", + request={}, + runtime_env=runtime_env, + executor="/home/ray/samples/sample_code.py", + ) + print(f"submit job - status: {status}, error: {error}, submission id {submission}") + assert status == 200 + assert error is None + # print execution log + 
remote_jobs.follow_execution(name="job-test", namespace="default", submission_id=submission, print_timeout=20) + # cleanup + status, error = remote_jobs.delete_ray_cluster(name="job-test", namespace="default") + print(f"Deleted cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile new file mode 100644 index 000000000..30921f37f --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -0,0 +1,83 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. +include ${REPOROOT}/.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +set-versions:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION_v2}'"/' pyproject.toml + sed -i.back 's/data_prep_toolkit_ray==[0-9].*/data_prep_toolkit_ray==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v2}",/' pyproject.toml + sed -i.back 's/ray==[0-9].*/ray==${RAY}",/' pyproject.toml + +build:: set-versions venv +ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make build` in upper directories and KFPv2 is not set + echo "Skipping build as KFPv2 is not defined" +else + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build +endif + +publish:: .check-env + @# Help: Publish the wheel to testpypi + if [ -d "dist" ]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv:: pyproject.toml .check-env +ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make venv` in upper directories and KFPv2 is not set + echo "Skipping venv as KFPv2 is not defined" +else + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . ${VENV_ACTIVATE}; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/python; \ + cd ../../../data-processing-lib/ray && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/ray; \ + cd ../python_apiserver_client && make set-versions && cd -; \ + pip install -e ../python_apiserver_client; \ + pip install -e .; \ + pip install pytest pytest-cov +endif + +test:: venv +ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make test` in upper directories and KFPv2 is not set + echo "Skipping test as KFPv2 is not defined" +else + @# Help: Use the already-built virtual environment to run pytest on the test directory. +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; + . 
${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; +endif +endif diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/README.md b/kfp/kfp_support_lib/kfp_v2_workflow_support/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml new file mode 100644 index 000000000..3e1607ee6 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -0,0 +1,49 @@ +[project] +name = "data_prep_toolkit_kfp_v2" +version = "0.2.0.dev6" +requires-python = ">=3.10,<3.12" +description = "Data Preparation Kit Library. KFP support" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, + { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, + { name = "Revital Eres", email = "eres@il.ibm.com" }, +] +dependencies = [ + "kfp==2.7.0", + "kfp-kubernetes==1.2.0", + "ray==2.9.3", + "requests", + "data_prep_toolkit_ray==0.2.0.dev6", + "python_apiserver_client", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", +] + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/workflow_support"] + +[tool.pytest.ini_options] +addopts = "--cov --cov-report term-missing --cov-fail-under 10" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py new file mode 100644 index 000000000..6b99a6be1 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -0,0 +1,6 @@ +from workflow_support.compile_utils.component import ( + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, + ComponentUtils +) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py new file mode 100644 index 000000000..4fa47290f --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -0,0 +1,58 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + 
kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set cashing + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task to one day (in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py new file mode 100644 index 000000000..0e80d97a2 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py @@ -0,0 +1 @@ +from workflow_support.pipeline_utils.pipeline_utils import PipelinesUtils diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py new file mode 100644 index 000000000..7566f6b2e --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py @@ -0,0 +1,173 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import time +from typing import Any, Optional + +from data_processing.utils import get_logger +from kfp_server_api import models + +from kfp import Client + + +logger = get_logger(__name__) + + +class PipelinesUtils: + """ + Helper class for pipeline management + """ + + def __init__(self, host: str = "http://localhost:8080"): + """ + Initialization + :param host: host to connect to + """ + self.kfp_client = Client(host=host) + + def upload_pipeline( + self, + pipeline_package_path: str = None, + pipeline_name: str = None, + overwrite: bool = False, + description: str = None, + ) -> models.api_pipeline.ApiPipeline: + """ + Uploads the pipeline + :param pipeline_package_path: Local path to the pipeline package. + :param pipeline_name: Optional. Name of the pipeline to be shown in the UI + :param overwrite: Optional. If pipeline exists, delete it before creating a new one. + :param description: Optional. 
Description of the pipeline to be shown in the UI. + :return: Server response object containing pipeline id and other information. + """ + if overwrite: + pipeline = self.get_pipeline_by_name(name=pipeline_name) + if pipeline is not None: + try: + logger.info(f"pipeline {pipeline_name} already exists. Trying to delete it.") + self.kfp_client.delete_pipeline(pipeline_id=pipeline.id) + except Exception as e: + logger.warning(f"Exception deleting pipeline {e} before uploading") + return None + try: + pipeline = self.kfp_client.upload_pipeline( + pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description + ) + except Exception as e: + logger.warning(f"Exception uploading pipeline {e}") + return None + if pipeline is None: + logger.warning(f"Failed to upload pipeline {pipeline_name}.") + return None + logger.info("Pipeline uploaded") + return pipeline + + def delete_pipeline(self, pipeline_id): + """ + Delete pipeline. + :param pipeline_id: id of the pipeline. + :return + Returns: + Object. If the method is called asynchronously, returns the request thread. + Raises: + kfp_server_api.ApiException: If pipeline is not found. + """ + return self.kfp_client.delete_pipeline(pipeline_id) + + def start_pipeline( + self, + pipeline: models.api_pipeline.ApiPipeline, + experiment: models.api_experiment.ApiExperiment, + params: Optional[dict[str, Any]], + ) -> str: + """ + Start a specified pipeline. + :param pipeline: pipeline definition + :param experiment: experiment to use + :param params: pipeline parameters + :return: the id of the run object + """ + job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + try: + run_id = self.kfp_client.run_pipeline( + experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params + ) + logger.info(f"Pipeline run {job_name} submitted") + return run_id.id + except Exception as e: + logger.warning(f"Exception starting pipeline {e}") + return None + + def get_experiment_by_name(self, name: str = "Default") -> models.api_experiment.ApiExperiment: + """ + Get experiment by name + :param name: name + :return: experiment + """ + try: + return self.kfp_client.get_experiment(experiment_name=name) + except Exception as e: + logger.warning(f"Exception getting experiment {e}") + return None + + def get_pipeline_by_name(self, name: str, np: int = 100) -> models.api_pipeline.ApiPipeline: + """ + Given pipeline name, return the pipeline + :param name: pipeline name + :param np: page size for pipeline query. For large clusters with many pipelines, you might need to + increase this number + :return: pipeline + """ + try: + # Get all pipelines + pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines + required = list(filter(lambda p: name in p.name, pipelines)) + if len(required) != 1: + logger.warning(f"Failure to get pipeline. 
Number of pipelines with name {name} is {len(required)}") + return None + return required[0] + + except Exception as e: + logger.warning(f"Exception getting pipeline {e}") + return None + + def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: + """ + Waits for a pipeline run to complete + :param run_id: run id + :param timeout: timeout (sec) (-1 wait forever) + :param wait: internal wait (sec) + :return: Completion status and an error message if such exists + """ + try: + if timeout > 0: + end = time.time() + timeout + else: + end = 2**63 - 1 + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: + time.sleep(wait) + if (end - time.time()) < 0: + return "failed", f"Execution is taking too long" + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + logger.info(f"Got pipeline execution status {status}") + + if status.lower() in ["succeeded", "completed"]: + return status, "" + return status, run_details.run.error + + except Exception as e: + logger.warning(f"Failed waiting pipeline completion {e}") + return "failed", str(e) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..8d2cdd648 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py @@ -0,0 +1,2 @@ +from workflow_support.runtime_utils.kfp_utils import KFPUtils +from workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py new file mode 100644 index 000000000..0e9951282 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py @@ -0,0 +1,160 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import json +import os +import re +import sys +from typing import Any + +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: tuple of S3 access key, secret key and endpoint (None for any that is not set) + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get the k8s namespace that we are running in + :return: namespace name, or an empty string if it could not be read + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + """ + Get unique runtime name + :param ray_name: base name of the Ray cluster + :param run_id: pipeline run id + :return: runtime name + """ + # K8s object names cannot contain special characters (except '-') and all characters should be lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus the namespace name will be the name of the Ray Route, + # whose length is restricted to 64 characters, + # therefore we restrict the returned name to 15 characters.
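+ # For example (illustration only): runtime_name(ray_name="noop_transform", run_id="0123456789") returns "noop-tran-01234" - the first 9 characters of the sanitized name plus the first 5 characters of the run id.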
+ if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if str(value) != "": + if isinstance(value, str): + if '"' in value: + logger.warning("Can't parse inputs with double quotation marks, please use single quotation marks instead") + res += f'--{key}="{value}" ' + elif isinstance(value, bool): + if value: + res += f"--{key} " + else: + res += f"--{key}={value} " + + logger.info(f"request to execute: {res}") + return res + + # Load a string that represents a JSON document into a python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the simplest computation of transform execution parameters + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required gpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py new file mode 100644 index 000000000..0b20b28c4 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py @@ -0,0 +1,527 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +import sys +import time +from typing import Any + +from data_processing.data_access import DataAccess, DataAccessFactory +from data_processing.utils import ParamsUtils, get_logger +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from workflow_support.runtime_utils import KFPUtils +from ray.job_submission import JobStatus + + +logger = get_logger(__name__) + + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + image_pull_policy: image pull policy, default IfNotPresent + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of this group environment + 
annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + image_pull_policy: image pull policy, default IfNotPresent + + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + 
environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Although the cluster is ready, the service web server might not be ready yet at this point. + # To ensure that it is ready, trying to get jobs info from the cluster. 
Even if it fails + # a couple of times, it's harmless + _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) + time.sleep(5) + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between the current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access: data access class + :param job_ready_timeout: timeout to wait for the job to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for the job to become ready, exiting") + sys.exit(1) + # While the job is running, print its log + previous_log_len = 0 + # At this point the job could have succeeded, failed, been stopped, or still be running.
So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {job_status}") + if job_status != JobStatus.SUCCEEDED: + sys.exit(1) + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.get_output_folder() + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +def _execute_remote_job( + name: str, + ns: str, + script: str, + params: dict[str, Any], + data_access_params: dict[str, Any], + additional_params: dict[str, Any], + remote_jobs: RayRemoteJobs, +) -> None: + """ + Execute remote job on Ray cluster + :param name: cluster name + :param ns: execution/cluster namespace + :param additional_params: additional parameters for the job + :param data_access_params: data access parameters + :param params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param script: script to run (has to be present in the image) + :param remote_jobs: remote jobs execution support class + :return: + """ + + status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) + if status != 200: + logger.error(f"Failed to submit job - status: {status}, error: {error}") + exit(1) + + logger.info(f"submitted job successfully, submission id {submission}") + # create data access + data_factory = DataAccessFactory() + data_factory.apply_input_params(args=data_access_params) + data_access = data_factory.create_data_access() + # print execution log + remote_jobs.follow_execution( + name=name, + namespace=ns, + submission_id=submission, + data_access=data_access, + print_timeout=additional_params.get("wait_print_tmout", 120), + job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), + ) + + +def execute_ray_jobs( + name: str, # name of Ray cluster + additional_params: dict[str, Any], + e_params: dict[str, Any], + exec_script_name: str, + server_url: str, +) -> None: + """ + Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. + All of the jobs will be executed, although some of the jobs may fail. 
+ :param name: cluster name + :param additional_params: additional parameters for the job + :param e_params: job execution parameters (specific to a particular transform, + generated by the transform workflow) + :param exec_script_name: script to run (has to be present in the image) + :param server_url: API server url + :return: None + """ + # prepare for execution + ns = KFPUtils.get_namespace() + if ns == "": + logger.warning("Failed to get namespace") + sys.exit(1) + # create remote jobs class + remote_jobs = RayRemoteJobs( + server_url=server_url, + http_retries=additional_params.get("http_retries", 5), + wait_interval=additional_params.get("wait_interval", 2), + ) + # find config parameter + config = ParamsUtils.get_config_parameter(e_params) + if config is None: + exit(1) + # get config value + config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) + s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) + if type(config_value) is not list: + # single request + return _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: config_value, "data_s3_cred": s3_creds}, + params=e_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + # remove config key from the dictionary + launch_params = dict(e_params) + del launch_params[config] + # Loop through all configurations + n_launches = 0 + for conf in config_value: + # populate individual config and launch + launch_params[config] = ParamsUtils.convert_to_ast(d=conf) + try: + _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: conf, "data_s3_cred": s3_creds}, + params=launch_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + n_launches += 1 + except SystemExit: + logger.warning(f"Failed to execute job for configuration {conf}") + continue + + if n_launches == 0: + logger.warning("All executions failed") + sys.exit(1) + else: + logger.info(f"{n_launches} out of {len(config_value)} succeeded") diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py new file mode 100644 index 000000000..200bf1676 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+################################################################################ + +from workflow_support.pipeline_utils import PipelinesUtils + +server_url = "http://localhost:8080/" + +def test_pipelines(): + """ + Test pipelines utils + """ + utils = PipelinesUtils(host=server_url) + # get pipeline by name + pipeline = utils.get_pipeline_by_name("[Tutorial] Data passing in python components") + assert pipeline is not None + # get default experiment + experiment = utils.get_experiment_by_name() + assert experiment is not None + # start pipeline + run = utils.start_pipeline(pipeline=pipeline, experiment=experiment, params={}) + assert run is not None + # wait for completion + status, error = utils.wait_pipeline_completion(run_id=run, wait=10) + assert status.lower() == "succeeded" + assert error == "" diff --git a/kfp/kfp_support_lib/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py similarity index 93% rename from kfp/kfp_support_lib/test/ray_remote_jobs_test.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py index 5ae76a5f5..f409550e9 100644 --- a/kfp/kfp_support_lib/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py @@ -11,9 +11,10 @@ ################################################################################ from configmaps import ConfigmapsManager -from kfp_support.api_server_client.params import ConfigMapVolume -from kfp_support.workflow_support.utils import RayRemoteJobs +from python_apiserver_client.params import ConfigMapVolume +from workflow_support.runtime_utils import RayRemoteJobs +server_url = "http:localhost:8080/ray/" def test_ray_remote_jobs(): """ @@ -56,7 +57,7 @@ def test_ray_remote_jobs(): cm_manager.create_code_map() # create cluster - remote_jobs = RayRemoteJobs(server_url="http://localhost:8080/ray") + remote_jobs = RayRemoteJobs(server_url=server_url) status, error = remote_jobs.create_ray_cluster( name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] ) diff --git a/kfp/kfp_support_lib/python_apiserver_client/.gitignore b/kfp/kfp_support_lib/python_apiserver_client/.gitignore new file mode 100644 index 000000000..3ff12a7a8 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/.gitignore @@ -0,0 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +htmlcov +.coverage +.cache +nosetests.xml +coverage.xml \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile new file mode 100644 index 000000000..642d475d1 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -0,0 +1,62 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. 
+include ../../../.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +set-versions:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + +build:: set-versions venv + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build + +publish:: .check-env + @# Help: Publish the wheel to testpypi + if [ -d "dist"]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv::pyproject.toml .check-env + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . ${VENV_ACTIVATE}; \ + pip install --upgrade pip; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/python; \ + pip install -e .; \ + pip install pytest pytest-cov + +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; +endif + diff --git a/kfp/kfp_support_lib/python_apiserver_client/README.md b/kfp/kfp_support_lib/python_apiserver_client/README.md new file mode 100644 index 000000000..de489adcd --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/README.md @@ -0,0 +1,4 @@ +# KubeRay API server APIs + +This is a copy of [Kuberay API server-client python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) +Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml new file mode 100644 index 000000000..a933f1bbc --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[options] +package_dir = ["src"] +[project] +name = "python_apiserver_client" +version = "0.1.0" +dependencies = [ + "requests", + "kubernetes", + "data-prep-toolkit==0.2.0.dev6", +] +authors = [ + { name="KubeRay project"}, +] +description = "A Kuberay python client library to manage clusters based on the KubeRay API server" +readme = {file = "README.md", content-type = "text/markdown"} +license = {text = "Apache-2.0"} +requires-python = ">=3.10" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: Apache License 2.0", + "Operating System :: OS Independent", +] + +[project.urls] +"Homepage" = "https://github.com/ray-project/kuberay" \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py new file mode 
100644 index 000000000..e6cdbec9a --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py @@ -0,0 +1 @@ +from python_apiserver_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py index 270815e77..9051e7c73 100644 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py @@ -14,7 +14,7 @@ import requests from data_processing.utils import get_logger -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( Cluster, RayJobInfo, RayJobRequest, diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py similarity index 65% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py index e5a7d70fa..207f961a9 100644 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py @@ -1,4 +1,4 @@ -from kfp_support.api_server_client.params.templates import ( +from python_apiserver_client.params.templates import ( TolerationOperation, TolerationEffect, Toleration, @@ -7,7 +7,7 @@ template_decoder, templates_decoder, ) -from kfp_support.api_server_client.params.volumes import ( +from python_apiserver_client.params.volumes import ( HostPath, MountPropagationMode, AccessMode, @@ -20,25 +20,25 @@ SecretVolume, volume_decoder, ) -from kfp_support.api_server_client.params.environmentvariables import ( +from python_apiserver_client.params.environmentvariables import ( EnvVarSource, EnvVarFrom, EnvironmentVariables, env_var_from_decoder, environment_variables_decoder, ) -from kfp_support.api_server_client.params.headnode import ( +from python_apiserver_client.params.headnode import ( ServiceType, HeadNodeSpec, DEFAULT_HEAD_START_PARAMS, head_node_spec_decoder, ) -from kfp_support.api_server_client.params.workernode import ( +from python_apiserver_client.params.workernode import ( WorkerNodeSpec, DEFAULT_WORKER_START_PARAMS, worker_node_spec_decoder, ) -from kfp_support.api_server_client.params.cluster import ( +from python_apiserver_client.params.cluster import ( Environment, AutoscalerOptions, ClusterSpec, @@ -50,4 +50,4 @@ cluster_decoder, clusters_decoder, ) -from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo +from python_apiserver_client.params.jobsubmission import RayJobRequest, RayJobInfo \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py index 922a14bef..5e1ee4867 100644 --- 
a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py @@ -13,7 +13,7 @@ import enum from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( BaseVolume, EnvironmentVariables, HeadNodeSpec, diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/environmentvariables.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/environmentvariables.py diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py index 7a9d4120f..37c2e2572 100644 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py @@ -13,7 +13,7 @@ import enum from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( BaseVolume, EnvironmentVariables, environment_variables_decoder, diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/jobsubmission.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/jobsubmission.py diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/templates.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/templates.py diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/volumes.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/volumes.py diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py index ddcf193cc..3a9f8e439 100644 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py @@ -12,7 +12,7 @@ from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( 
BaseVolume, EnvironmentVariables, environment_variables_decoder, diff --git a/kfp/kfp_support_lib/test/api_params_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py similarity index 99% rename from kfp/kfp_support_lib/test/api_params_test.py rename to kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py index 804c84aad..53740c939 100644 --- a/kfp/kfp_support_lib/test/api_params_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py @@ -12,7 +12,7 @@ import json -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, AccessMode, diff --git a/kfp/kfp_support_lib/python_apiserver_client/test/configmaps.py b/kfp/kfp_support_lib/python_apiserver_client/test/configmaps.py new file mode 100644 index 000000000..65e53e828 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/test/configmaps.py @@ -0,0 +1,72 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from kubernetes import client, config + + +CMAP_VALUE = """ +import ray +import os +import requests + +ray.init() + +@ray.remote +class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + +counter = Counter.remote() + +for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + +# Verify that the correct runtime env was used for the job. +assert requests.__version__ == "2.26.0" +""" +CMAP_NAME = "ray-job-code-sample" + + +class ConfigmapsManager: + """ + Simple support class to manage config maps. 
Assumes local access to Kubectl + """ + + def __init__(self): + config.load_kube_config() + self.api_instance = client.CoreV1Api() + + def list_configmaps(self) -> list[str]: + cm_list = self.api_instance.list_namespaced_config_map(namespace="default").items + return [cm.metadata.name for cm in cm_list] + + def create_code_map(self) -> None: + cmap = client.V1ConfigMap() + cmap.metadata = client.V1ObjectMeta(name=CMAP_NAME) + cmap.data = {"sample_code.py": CMAP_VALUE} + self.api_instance.create_namespaced_config_map(namespace="default", body=cmap) + + def delete_code_map(self) -> None: + try: + self.api_instance.delete_namespaced_config_map(name="ray-job-code-sample", namespace="default") + except Exception as e: + print("config map ray-job-code-sample does not exist") diff --git a/kfp/kfp_support_lib/test/kuberay_api_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py similarity index 97% rename from kfp/kfp_support_lib/test/kuberay_api_test.py rename to kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py index b2a444ce3..d4dd12a5e 100644 --- a/kfp/kfp_support_lib/test/kuberay_api_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py @@ -13,8 +13,8 @@ import time from configmaps import ConfigmapsManager -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( DEFAULT_WORKER_START_PARAMS, Cluster, ClusterSpec, @@ -30,13 +30,15 @@ WorkerNodeSpec, ) +server_url = "http://localhost:8080/ray" + def test_templates(): """ Test template """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") # create @@ -81,7 +83,7 @@ def test_cluster(): Test cluster """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") _, _ = apis.delete_cluster(ns="default", name="test") @@ -181,7 +183,7 @@ def test_job_submission(): :return: """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") _, _ = apis.delete_cluster(ns="default", name="test-job") diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md b/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md deleted file mode 100644 index 423f743a1..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# KubeRay API server APIs - -This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) -Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py deleted file mode 100644 index 60cbbc2f2..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md 
b/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md deleted file mode 100644 index b477e9a42..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Workflow Utils - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## PipelinesUtils - -This class provides some higher level functionality based on the capabilities of the python KFP client, including" -* get_experiment_by_name obtains KFP experiment object based on its name -* get_pipeline_by_name obtains KFP pipeline object based on its name -* start_pipeline start a pipeline represented by pipeline object in experiment represented by experiment object and a -dictionary of parameters. It returns kfp run ID -* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. 
-* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py deleted file mode 100644 index 166032380..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from kfp_support.workflow_support.utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.utils.pipeline_utils import PipelinesUtils -from kfp_support.workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC -from kfp_support.workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/requirements.env b/kfp/requirements.env index ef5110bcc..6fa707df5 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -1,2 +1,11 @@ RAY=2.9.3 -KFP=1.8.22 +KFP_v2=2.7.0 +KFP_v1=1.8.22 + +ifeq ($(KFPv2), 1) + KFP=$(KFP_v2) + WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support +else + KFP=$(KFP_v1) + WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support +endif diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py index f63bb0638..a64154237 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py @@ -1,7 +1,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ONE_WEEK_SEC +from kfp_support.workflow_support.runtime_utils import ONE_WEEK_SEC # Components diff --git a/kind/Makefile b/kind/Makefile index a20a8ea76..da22e24f3 100644 --- a/kind/Makefile +++ b/kind/Makefile @@ -2,7 +2,13 @@ # know where they are running from. export REPOROOT=${CURDIR}/../ -IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kind/requirements.env | sed 's/=/:=/' | sed 's/^/export /' > makeenv") + +ifneq ($(KFPv2), 1) + GREP_V=KFP_V2 +else + GREP_V=KFP_V1 +endif +IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kind/requirements.env | sed 's/=/:=/;/$(GREP_V)/d;s/KFP_V._//;s/^/export /' > makeenv") include makeenv diff --git a/kind/README.md b/kind/README.md index e4a106f25..60fae6841 100644 --- a/kind/README.md +++ b/kind/README.md @@ -28,7 +28,10 @@ amount of node, modify [cluster configuration](hack/kind-cluster-config.yaml) Install [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/v1/installation/standalone-deployment/#deploying-kubeflow-pipelines) and wait for it to be ready: ```shell -cd $ROOT_DIR/hack/tools/ && PIPELINE_VERSION=1.8.5 ./install_kubeflow.sh deploy && cd - +# Set required KFP version. You can reference to the latest supported version in the [requirements.env](./requirements.env) file. 
+# Currently, we support 1.8.5 for KFPv1 and 2.2.0 for KFP v2 +export PIPELINE_VERSION=1.8.5 +cd $ROOT_DIR/hack/tools/ && ./install_kubeflow.sh deploy && cd - kubectl wait --for=condition=ready --all pod -n kubeflow --timeout=300s ``` @@ -56,7 +59,7 @@ kubectl wait --namespace ingress-nginx \ --timeout=90s ``` -To deploy the ingress for ray apiserver, kfp and Minio execute the following: +To deploy the ingress for Ray API Server, KFP and MinIO execute the following: ```shell kubectl apply -f $ROOT_DIR/hack/ray_api_server_ingress.yaml kubectl apply -f $ROOT_DIR/hack/kfp_ingress.yaml diff --git a/kind/requirements.env b/kind/requirements.env index 70eca5bd8..cedd6ba0e 100644 --- a/kind/requirements.env +++ b/kind/requirements.env @@ -1,4 +1,6 @@ -PIPELINE_VERSION=1.8.5 +KFP_V1_PIPELINE_VERSION=1.8.5 +KFP_V2_PIPELINE_VERSION=2.2.0 + KUBERAY_OPERATOR=1.0.0 KUBERAY_APISERVER=1.1.0 diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 4d0d47617..4a9d0d0a8 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -6,23 +6,15 @@ include ${REPOROOT}/kfp/requirements.env include ${REPOROOT}/.make.defaults USE_DEV_IMAGES ?= 1 -TRANSFORM_RUNTIME = ray define set_env_var $(eval export $(1)=$(2)) endef +# FIXME .PHONY: .transforms_workflows.reconcile-requirements .transforms_workflows.reconcile-requirements: - cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) reconcile-requirements - @while IFS= read -r line; do \ - [ -z "$$line" ] && continue; \ - [[ $$line == *#* ]] && continue; \ - export DOCKER_IMAGE_NAME=$$(echo $$line |cut -d "=" -f 1 |sed "s/_VERSION//" |tr '[:upper:]' '[:lower:]'); \ - export DOCKER_IMAGE_VERSION=$$(echo $$line |cut -d "=" -f 2); \ - sed -i.back "s/data-prep-kit\/$$DOCKER_IMAGE_NAME\-${TRANSFORM_RUNTIME}:.*/data-prep-kit\/$$DOCKER_IMAGE_NAME\-${TRANSFORM_RUNTIME}:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\ - done < ${REPOROOT}/.make.versions - @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION}\"/" ${PIPELINE_FILE} + .PHONY: .transforms_workflows.compile-pipeline .transforms_workflows.compile-pipeline: @@ -45,20 +37,21 @@ ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) load-image endif - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m kfp_support.workflow_support.utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${REPOROOT}/kfp/kfp_support_lib/ rm -rf ${REPOROOT}/transforms/venv $(MAKE) -C ${REPOROOT}/transforms .defaults.python-lib-src-venv . ${WORKFLOW_VENV_ACTIVATE}; \ - pip install -e $(REPOROOT)/kfp/kfp_support_lib/; + pip install -e $(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client; \ + pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); @# Help: Create the virtual environment common to all workflows - + .PHONY: .transforms_workflows.upload-pipeline .transforms_workflows.upload-pipeline: $(call set_env_var, CLUSTER_EXISTS, $(shell kind get clusters | grep ${KIND_CLUSTER_NAME})) @if [ -z ${CLUSTER_EXISTS} ]; then \ cd ${REPOROOT} && make setup; \ fi - . 
${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m kfp_support.workflow_support.utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} diff --git a/transforms/code/code_quality/Makefile b/transforms/code/code_quality/Makefile index 14c9b098a..5cc85aab2 100644 --- a/transforms/code/code_quality/Makefile +++ b/transforms/code/code_quality/Makefile @@ -47,21 +47,21 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/code/code_quality/kfp_ray/Makefile b/transforms/code/code_quality/kfp_ray/Makefile new file mode 100644 index 000000000..d93c668c1 --- /dev/null +++ b/transforms/code/code_quality/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=code_quality_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py similarity index 66% rename from transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py rename to transforms/code/code_quality/kfp_ray/code_quality_wf.py index 09b1e6cb7..b89f74083 100644 --- a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -9,35 +9,83 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - -# NOTE: This file is auto generated by Pipeline Generator. 
+import os import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils # the name of the job script EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" -component_spec_path = "../../../../../kfp/kfp_ray_components/" +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + cq_contents_column_name: str, + cq_language_column_name: str, + cq_tokenizer: str, + cq_hf_token: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "cq_contents_column_name": cq_contents_column_name, + "cq_language_column_name": cq_language_column_name, + "cq_tokenizer": cq_tokenizer, + "cq_hf_token": cq_hf_token, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -120,7 +168,7 @@ def code_quality( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -128,7 +176,18 @@ def code_quality( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + cq_contents_column_name=cq_contents_column_name, + cq_language_column_name=cq_language_column_name, + cq_tokenizer=cq_tokenizer, + cq_hf_token=cq_hf_token, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( @@ -147,19 +206,8 @@ def code_quality( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "cq_contents_column_name": cq_contents_column_name, - "cq_language_column_name": cq_language_column_name, - "cq_tokenizer": cq_tokenizer, - "cq_hf_token": cq_hf_token, - }, + # note that the parameters below are specific for NOOP transform + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/code/code_quality/kfp_ray/v1/Makefile b/transforms/code/code_quality/kfp_ray/v1/Makefile deleted file mode 100644 index ae484ed12..000000000 --- a/transforms/code/code_quality/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=code_quality_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=code_quality_wf.py 
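Editor's aside: the KFP v1/v2 switch introduced above for code_quality is repeated nearly verbatim in each reworked *_wf.py that follows (malware, proglang_select, doc_id, ededup, fdedup). As a reading aid, here is a condensed, standalone sketch of that pattern. The KFPv2 environment variable, the base image and the two component-creation calls are taken from this diff; the tiny compute_exec_params_func body is only a placeholder for the real per-transform function.

import os
import uuid

import kfp.components as comp
import kfp.dsl as dsl

base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6"


def compute_exec_params_func(worker_options: str, actor_options: str) -> dict:
    # placeholder for the transform-specific parameter builder shown in the diff
    return {"runtime_worker_options": actor_options}


if os.getenv("KFPv2", "0") == "1":
    # KFP SDK v2: build the component with the dsl decorator; RUN_ID_PLACEHOLDER is no
    # longer usable, so a uuid generated at compilation time stands in for the run id.
    compute_exec_params_op = dsl.component_decorator.component(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = uuid.uuid4().hex
else:
    # KFP SDK v1: build the component from the function; the run id resolves at runtime.
    compute_exec_params_op = comp.create_component_from_func(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = dsl.RUN_ID_PLACEHOLDER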
diff --git a/transforms/code/malware/Makefile b/transforms/code/malware/Makefile index 14c9b098a..41413c041 100644 --- a/transforms/code/malware/Makefile +++ b/transforms/code/malware/Makefile @@ -47,21 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements - + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/code/malware/kfp_ray/Makefile b/transforms/code/malware/kfp_ray/Makefile new file mode 100644 index 000000000..ce744072d --- /dev/null +++ b/transforms/code/malware/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=malware_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/malware/kfp_ray/v1/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py similarity index 67% rename from transforms/code/malware/kfp_ray/v1/malware_wf.py rename to transforms/code/malware/kfp_ray/malware_wf.py index 7f65b3a9c..d0e22643b 100644 --- a/transforms/code/malware/kfp_ray/v1/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -10,32 +10,78 @@ # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) # the name of the job script EXEC_SCRIPT_NAME: str = "malware_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.5.0" +task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.5.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. 
Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + malware_input_column: str, + malware_output_column: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "malware_input_column": malware_input_column, + "malware_output_column": malware_output_column, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -107,7 +153,7 @@ def malware( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -115,6 +161,14 @@ def malware( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + malware_input_column=malware_input_column, + malware_output_column=malware_output_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -134,18 +188,7 @@ def malware( run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, # note that the parameters below are specific for malware transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "malware_input_column": malware_input_column, - "malware_output_column": malware_output_column, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/code/malware/kfp_ray/v1/Makefile b/transforms/code/malware/kfp_ray/v1/Makefile deleted file mode 100644 index d673ca682..000000000 --- a/transforms/code/malware/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=malware_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=malware_wf.py diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index 4991e4002..e7ad671da 100644 --- a/transforms/code/proglang_select/Makefile +++ 
b/transforms/code/proglang_select/Makefile @@ -47,20 +47,21 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements + diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile new file mode 100644 index 000000000..2bdfb2d1d --- /dev/null +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=proglang_select_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py similarity index 68% rename from transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py rename to transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 14b17ac32..ad256903f 100644 --- a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -10,32 +10,78 @@ # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) # the name of the job script EXEC_SCRIPT_NAME: str = "proglang_select_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. 
Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + proglang_select_allowed_langs_file: str, + proglang_select_language_column: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, + "proglang_select_language_column": proglang_select_language_column, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -111,7 +157,7 @@ def lang_select( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -119,6 +165,14 @@ def lang_select( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + proglang_select_allowed_langs_file=proglang_select_allowed_langs_file, + proglang_select_language_column=proglang_select_language_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -137,19 +191,8 @@ def lang_select( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, - "proglang_select_language_column": proglang_select_language_column, - }, + # note that the parameters below are specific for this transform + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, prefix=PREFIX, diff --git a/transforms/code/proglang_select/kfp_ray/v1/Makefile b/transforms/code/proglang_select/kfp_ray/v1/Makefile deleted file mode 100644 index e2c8c8b14..000000000 --- a/transforms/code/proglang_select/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=proglang_select_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) 
.transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index b7c9b04b3..da86986db 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile new file mode 100644 index 000000000..54b7e3781 --- /dev/null +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=doc_id_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py similarity index 63% rename from transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py rename to transforms/universal/doc_id/kfp_ray/doc_id_wf.py index d7cbb11dd..5cbb3e974 100644 --- a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -9,40 +9,91 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.4.0.dev6" # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" - # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + data_checkpointing: bool, + data_data_sets: str, + data_files_to_use: str, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + doc_id_doc_column: str, + doc_id_hash_column: str, + doc_id_int_column: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, + "data_data_sets": data_data_sets, + "data_files_to_use": data_files_to_use, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "doc_id_doc_column": doc_id_doc_column, + "doc_id_hash_column": doc_id_hash_column, + "doc_id_int_column": doc_id_int_column, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. TASK_NAME: str = "doc_id" @@ -114,7 +165,7 @@ def doc_id( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -122,12 +173,24 @@ def doc_id( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, + data_data_sets=data_data_sets, + data_files_to_use=data_files_to_use, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_id_doc_column=doc_id_doc_column, + doc_id_hash_column=doc_id_hash_column, + doc_id_int_column=doc_id_int_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -138,25 +201,10 @@ def doc_id( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "data_checkpointing": data_checkpointing, - "data_data_sets": data_data_sets, - "data_files_to_use": data_files_to_use, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "doc_id_doc_column": doc_id_doc_column, - "doc_id_hash_column": doc_id_hash_column, - "doc_id_int_column": doc_id_int_column, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -164,8 +212,9 @@ def doc_id( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) 
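# Editor's note (sketch, not part of this patch): the pipeline-level timeout above is commented
# out because dsl.get_pipeline_conf() exists only in the KFP v1 SDK and is gone in v2, which is
# presumably what the TODO refers to. Until a v2 equivalent is wired in, one possible interim
# guard, reusing the same KFPv2 environment variable as the rest of this file, would be:
#
#     if os.getenv("KFPv2", "0") != "1":
#         dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC)
#
# The guard is an assumption, not something this patch adds.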
if __name__ == "__main__": diff --git a/transforms/universal/doc_id/kfp_ray/v1/Makefile b/transforms/universal/doc_id/kfp_ray/v1/Makefile deleted file mode 100644 index e33049af4..000000000 --- a/transforms/universal/doc_id/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=doc_id_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=doc_id_wf.py diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 7735c3251..a766f453e 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -47,20 +47,19 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile new file mode 100644 index 000000000..235258fd6 --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=ededup_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py similarity index 74% rename from transforms/universal/ededup/kfp_ray/v1/ededup_wf.py rename to transforms/universal/ededup/kfp_ray/ededup_wf.py index 89682fe6c..6297470e9 100644 --- 
a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -9,31 +9,51 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os + +from src.ededup_compute_execution_params import ededup_compute_execution_params +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.ededup_compute_execution_params import ededup_compute_execution_params +task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0.dev6" + # the name of the job script EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0" - # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=ededup_compute_execution_params, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func( + func=ededup_compute_execution_params, base_image=base_kfp_image + ) + run_id = dsl.RUN_ID_PLACEHOLDER -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image) # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -110,7 +130,7 @@ def ededup( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -118,7 +138,14 @@ def ededup( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, - params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=ededup_doc_column, + hash_cpu=ededup_hash_cpu, n_samples=ededup_n_samples, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) @@ -127,7 +154,7 @@ def ededup( # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -138,21 +165,9 @@ def ededup( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "ededup_doc_column": ededup_doc_column, - "ededup_hash_cpu": ededup_hash_cpu, - "ededup_num_hashes": compute_exec_params.outputs["hashes"], - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -160,8 +175,9 @@ def ededup( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py similarity index 71% rename from transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py rename to
transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py index 529a6ace3..16a5a0c28 100644 --- a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py +++ b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -10,24 +10,37 @@ # limitations under the License. ################################################################################ -from typing import Any, NamedTuple +from typing import Any def ededup_compute_execution_params( worker_options: str, # ray worker configuration actor_options: str, # actor's resource requirements - params: dict[str, Any], # exact dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple("Output", [("workers", int), ("hashes", int)]): + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: str, # code location + doc_column: str, # key for accessing data + hash_cpu: float, # number of CPUs per hash + n_samples: int, # number of samples for parameters computation +) -> dict: """ Compute exact dedup execution parameters :param worker_options: cluster parameters :param actor_options: actor request requirements :param n_samples: number of samples to use - :param params: exact dedup specific parameters containing the following keys: - s3_config - s3 config - hash_cpu - hash cpu requirements - :return: json string, containing computed number of workers and hashes + :param data_s3_config - s3 config + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_pipeline_id - pipeline id + :param runtime_job_id - job id, or just a unique string + :param runtime_code_location - code location + :param doc_column - key for accessing data + :param hash_cpu - number of CPUs per hash + :param n_samples - umber of samples for parameters computation + :return: a dictionary with a Ray Job execution parameters """ # required import import math @@ -35,7 +48,7 @@ def ededup_compute_execution_params( from data_processing.data_access import DataAccessS3 from data_processing.utils import GB, KB - from kfp_support.workflow_support.utils import KFPUtils + from workflow_support.runtime_utils import KFPUtils EXECUTION_OF_KB_DOC = 0.00025 @@ -53,7 +66,7 @@ def ededup_compute_execution_params( # get credentials s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) if type(s3_config) is list: # S3 config is list. 
take the first element s3_config = s3_config[0] @@ -71,7 +84,6 @@ def ededup_compute_execution_params( n_hashes = math.ceil(number_of_docs * 32 / GB) print(f"Estimated Required hashes {n_hashes}") print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - hash_cpu: float = float(params.get("hash_cpu")) required_hash_cpu = n_hashes * hash_cpu required_hash_mem = n_hashes * 2 if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: @@ -97,6 +109,16 @@ def ededup_compute_execution_params( print(f"Try to increase the size of the cluster or increase size of the cpu per worker") sys.exit(1) print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") - # return json.dumps({"workers": n_workers, "hashes": n_hashes}) - return (n_workers, n_hashes) - # return (1, 1) + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": n_workers, + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "ededup_doc_column": doc_column, + "ededup_hash_cpu": hash_cpu, + "ededup_num_hashes": n_hashes, + } diff --git a/transforms/universal/ededup/kfp_ray/v1/Makefile b/transforms/universal/ededup/kfp_ray/v1/Makefile deleted file mode 100644 index 66331ebfb..000000000 --- a/transforms/universal/ededup/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=ededup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=ededup_wf.py diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index 7735c3251..41413c041 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile new file mode 100644 index 000000000..f741801bc --- /dev/null +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -0,0 +1,40 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include 
$(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=fdedup_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py similarity index 73% rename from transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py rename to transforms/universal/fdedup/kfp_ray/fdedup_wf.py index d0feea492..c3e21a85b 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -9,31 +9,51 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os + +from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.4.0.dev6" + # the name of the job script EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.4.0" - # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=fdedup_compute_execution_params, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func( + func=fdedup_compute_execution_params, base_image=base_kfp_image + ) + run_id = dsl.RUN_ID_PLACEHOLDER -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image) # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -139,7 +159,7 @@ def fdedup( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -147,14 +167,26 @@ def fdedup( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, - params={ - "threshold": fdedup_threshold, - "num_permutations": fdedup_num_permutations, - "s3_config": data_s3_config, - "bucket_cpu": fdedup_bucket_cpu, - "doc_cpu": fdedup_doc_cpu, - "minhash_cpu": fdedup_mhash_cpu, - }, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=fdedup_doc_column, + id_column=fdedup_id_column, + cluster_column=fdedup_cluster_column, + bucket_cpu=fdedup_bucket_cpu, + doc_cpu=fdedup_doc_cpu, + mhash_cpu=fdedup_mhash_cpu, + num_permutations=fdedup_num_permutations, + threshold=fdedup_threshold, + shingles_size=fdedup_shingles_size, + delimiters=fdedup_delimiters, + random_delay_limit=fdedup_random_delay_limit, + snapshot_delay=fdedup_snapshot_delay, + use_doc_snapshot=fdedup_use_doc_snapshot, + use_bucket_snapshot=fdedup_use_bucket_snapshot, n_samples=fdedup_n_samples, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) @@ -163,7 +195,7 @@ def fdedup( # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -174,36 +206,9 @@ def fdedup( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "fdedup_doc_column": fdedup_doc_column, - "fdedup_id_column": fdedup_id_column, - "fdedup_cluster_column": fdedup_cluster_column, - "fdedup_bucket_cpu": fdedup_bucket_cpu, - "fdedup_doc_cpu": fdedup_doc_cpu, - "fdedup_mhash_cpu": fdedup_mhash_cpu, - "fdedup_num_doc_actors": compute_exec_params.outputs["docs"], - 
"fdedup_num_bucket_actors": compute_exec_params.outputs["buckets"], - "fdedup_num_minhash_actors": compute_exec_params.outputs["min_hashes"], - "fdedup_num_preprocessors": compute_exec_params.outputs["preprocessors"], - "fdedup_num_permutations": fdedup_num_permutations, - "fdedup_threshold": fdedup_threshold, - "fdedup_shingles_size": fdedup_shingles_size, - "fdedup_delimiters": fdedup_delimiters, - "fdedup_random_delay_limit": fdedup_random_delay_limit, - "fdedup_snapshot_delay": fdedup_snapshot_delay, - "fdedup_use_doc_snapshot": fdedup_use_doc_snapshot, - "fdedup_use_bucket_snapshot": fdedup_use_bucket_snapshot, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -211,8 +216,9 @@ def fdedup( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py similarity index 62% rename from transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py rename to transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index a9f8b8d66..9d07940c1 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -16,37 +16,65 @@ def fdedup_compute_execution_params( worker_options: str, # ray worker configuration actor_options: str, # actor's resource requirements - params: dict[str, Any], # fuzzy dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple( - "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] -): + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: str, # code location + doc_column: str, # document column name + id_column: str, # integer document id column name + cluster_column: str, # cluster column name + bucket_cpu: float, # number of CPUs per bucket hash + doc_cpu: float, # number of CPUs per doc hash + mhash_cpu: float, # number of CPUs per minhash hash + num_permutations: int, # number of permutations + threshold: float, # threshold, + shingles_size: int, # number of words in shingle + delimiters: str, # delimiter for splitting document + random_delay_limit: int, # delay between reads to reduce S3 load. 
+ # A random number between 0 and random_delay_limit is used + snapshot_delay: int, # delay between restoring individual actors + use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots + use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots + n_samples: int, # number of samples to use +) -> dict: # NamedTuple( + # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + """ Compute fuzzy dedup execution parameters :param worker_options: cluster parameters :param actor_options: actor request requirements + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param doc_column: document column name + :param id_column: integer document id column name + :param cluster_column: cluster column name + :param bucket_cpu: number of CPUs per bucket hash + :param doc_cpu: number of CPUs per doc hash + :param mhash_cpu: number of CPUs per minhash hash + :param num_permutations: number of permutations + :param threshold: threshold, + :param shingles_size: number of words in shingle + :param delimiters: delimiter for splitting document + :param random_delay_limit: # delay between reads to reduce S3 load. A random number between 0 and random_delay_limit is used + :param snapshot_delay: delay between restoring individual actors + :param use_doc_snapshot: flag to skip documents building and start from existing snapshots + :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots :param n_samples: number of samples to use - :param params: fuzzy dedup specific parameters containing the following keys: - threshold - threshold for fuzzy computations - num_permutations - number of permutation - s3_config - s3 config - bucket_cpu - bucket actor cpu requirements - minhash_cpu - minhash actor cpu requirements - doc_cpu - doc actor cpu requirements - :return: json string, containing - workers - number of workers - preprocessors - number of preprocessors - docs - number of doc actors - buckets - number of bucket actors - min_hashes - number of minhash actors + :return: a dictionary with a Ray Job execution parameters """ import math import sys from data_processing.data_access import DataAccessS3 from data_processing.utils import GB, KB - from kfp_support.workflow_support.utils import KFPUtils from scipy.integrate import quad as integrate + from workflow_support.runtime_utils import KFPUtils EXECUTION_OF_KB_DOC = 0.003 @@ -104,8 +132,8 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: # fuzzy parameters num_buckets, length_bucket = fuzzy_optimal_param( - threshold=float(params.get("threshold")), - num_perm=int(params.get("num_permutations")), + threshold=threshold, + num_perm=num_permutations, false_positive_weight=0.5, false_negative_weight=0.5, ) @@ -124,7 +152,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: # get credentials s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) if type(s3_config) is list: # S3 config is list. 
take the first element s3_config = s3_config[0] @@ -143,13 +171,10 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) # compute cpu requirements - bucket_cpu = float(params.get("bucket_cpu")) - min_hash_cpu = float(params.get("minhash_cpu")) - doc_cpu = float(params.get("doc_cpu")) # Define number of preprocessors. We are assuming that preprocessors and workers are using the same amount # of CPUs n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * min_hash_cpu - d_actors * doc_cpu) / actor_cpu + (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu ) if n_preprocessors < 0: print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") @@ -176,9 +201,36 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: print( f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * min_hash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" + f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" ) projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 print(f"Projected execution time {projected_execution} min") - return (n_workers, n_preprocessors, d_actors, b_actors, m_actors) + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": n_workers, + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "fdedup_doc_column": doc_column, + "fdedup_id_column": id_column, + "fdedup_cluster_column": cluster_column, + "fdedup_bucket_cpu": bucket_cpu, + "fdedup_doc_cpu": doc_cpu, + "fdedup_mhash_cpu": mhash_cpu, + "fdedup_num_doc_actors": d_actors, + "fdedup_num_bucket_actors": b_actors, + "fdedup_num_minhash_actors": m_actors, + "fdedup_num_preprocessors": n_preprocessors, + "fdedup_num_permutations": num_permutations, + "fdedup_threshold": threshold, + "fdedup_shingles_size": shingles_size, + "fdedup_delimiters": delimiters, + "fdedup_random_delay_limit": random_delay_limit, + "fdedup_snapshot_delay": snapshot_delay, + "fdedup_use_doc_snapshot": use_doc_snapshot, + "fdedup_use_bucket_snapshot": use_bucket_snapshot, + } diff --git a/transforms/universal/fdedup/kfp_ray/v1/Makefile b/transforms/universal/fdedup/kfp_ray/v1/Makefile deleted file mode 100644 index 8a82e5d18..000000000 --- a/transforms/universal/fdedup/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=fdedup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=fdedup_wf.py diff --git 
a/transforms/universal/filter/Makefile b/transforms/universal/filter/Makefile index 4991e4002..6104574ea 100644 --- a/transforms/universal/filter/Makefile +++ b/transforms/universal/filter/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/filter/kfp_ray/Makefile b/transforms/universal/filter/kfp_ray/Makefile new file mode 100644 index 000000000..4d8779a25 --- /dev/null +++ b/transforms/universal/filter/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=filter_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/filter/kfp_ray/v1/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py similarity index 67% rename from transforms/universal/filter/kfp_ray/v1/filter_wf.py rename to transforms/universal/filter/kfp_ray/filter_wf.py index d4a413dab..90d2b197b 100644 --- a/transforms/universal/filter/kfp_ray/v1/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -10,35 +10,80 @@ # limitations under the License. ################################################################################ -# NOTE: This file is auto generated by Pipeline Generator. 
+import os import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils # the name of the job script EXEC_SCRIPT_NAME: str = "filter_transform_ray.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + filter_criteria_list: str, + filter_logical_operator: str, + filter_columns_to_drop: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "filter_criteria_list": filter_criteria_list, + "filter_logical_operator": filter_logical_operator, + "filter_columns_to_drop": filter_columns_to_drop, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -113,7 +158,7 @@ def filtering( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -121,7 +166,17 @@ def filtering( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + filter_criteria_list=filter_criteria_list, + filter_logical_operator=filter_logical_operator, + filter_columns_to_drop=filter_columns_to_drop, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( @@ -140,19 +195,7 @@ def filtering( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "filter_criteria_list": filter_criteria_list, - "filter_logical_operator": filter_logical_operator, - "filter_columns_to_drop": filter_columns_to_drop, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/universal/filter/kfp_ray/v1/Makefile b/transforms/universal/filter/kfp_ray/v1/Makefile deleted file mode 100644 index b7696b246..000000000 --- a/transforms/universal/filter/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=filter_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=filter_wf.py diff --git a/transforms/universal/noop/Makefile 
b/transforms/universal/noop/Makefile index 7735c3251..41413c041 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/noop/kfp_ray/v1/Makefile b/transforms/universal/noop/kfp_ray/Makefile similarity index 84% rename from transforms/universal/noop/kfp_ray/v1/Makefile rename to transforms/universal/noop/kfp_ray/Makefile index 0a9f8a6d4..4f1d5ee7c 100644 --- a/transforms/universal/noop/kfp_ray/v1/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -1,19 +1,31 @@ -REPOROOT=${CURDIR}/../../../../../ +REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -SRC_DIR=${CURDIR}/../../ray/ +SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + .PHONY: workflow-build workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done + $(MAKE) $(YAML_WF) .PHONY: workflow-test workflow-test: workflow-build diff --git a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py similarity index 68% rename from transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py rename to transforms/universal/noop/kfp_ray/noop_multiple_wf.py index cf374c8af..67b4aead0 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -9,15 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" @@ -29,13 +27,60 @@ base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, +) -> dict: + import uuid + + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -107,7 +152,7 @@ def noop( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -115,12 +160,19 @@ def noop( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -131,20 +183,10 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, 
additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -152,8 +194,9 @@ def noop( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py similarity index 67% rename from transforms/universal/noop/kfp_ray/v1/noop_wf.py rename to transforms/universal/noop/kfp_ray/noop_wf.py index d43b88189..8748a60ca 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -9,15 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" @@ -29,19 +27,64 @@ base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. 
Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. TASK_NAME: str = "noop" @@ -53,7 +96,7 @@ def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' '"image": "' + task_image + '"}', + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' '"image_pull_secret": "", "image": "' + task_image + '"}', server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -106,7 +149,7 @@ def noop( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -114,12 +157,19 @@ def noop( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -130,20 +180,10 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - 
"runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -151,8 +191,9 @@ def noop( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index 7735c3251..41413c041 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile new file mode 100644 index 000000000..7d5aa6687 --- /dev/null +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py similarity index 66% rename from transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py rename to transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 2ff84bdfd..f74d0a331 100644 --- a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -9,32 +9,117 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - # the name of the job script EXEC_SCRIPT_NAME: str = "tokenization_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.3.0" +task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" -component_spec_path = "../../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + tkn_tokenizer: str, + tkn_tokenizer_args: str, + tkn_doc_id_column: str, + tkn_doc_content_column: str, + tkn_text_lang: str, + tkn_chunk_size: int, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "tkn_tokenizer": tkn_tokenizer, + "tkn_tokenizer_args": tkn_tokenizer_args, + "tkn_doc_id_column": tkn_doc_id_column, + "tkn_doc_content_column": tkn_doc_content_column, + "tkn_text_lang": tkn_text_lang, + "tkn_chunk_size": tkn_chunk_size, + } + + +# KFPv1 and KFPv2 use different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but KFPv2 deprecates it and moves it to a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being we use a unique string created at + # compilation time.
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -116,7 +201,7 @@ def tokenization( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -124,10 +209,21 @@ def tokenization( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + tkn_tokenizer=tkn_tokenizer, + tkn_tokenizer_args=tkn_tokenizer_args, + tkn_doc_id_column=tkn_doc_id_column, + tkn_doc_content_column=tkn_doc_content_column, + tkn_text_lang=tkn_text_lang, + tkn_chunk_size=tkn_chunk_size, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, @@ -144,22 +240,7 @@ def tokenization( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "tkn_tokenizer": tkn_tokenizer, - "tkn_tokenizer_args": tkn_tokenizer_args, - "tkn_doc_id_column": tkn_doc_id_column, - "tkn_doc_content_column": tkn_doc_content_column, - "tkn_text_lang": tkn_text_lang, - "tkn_chunk_size": tkn_chunk_size, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/universal/tokenization/kfp_ray/v1/Makefile b/transforms/universal/tokenization/kfp_ray/v1/Makefile deleted file mode 100644 index 5814e2935..000000000 --- a/transforms/universal/tokenization/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=tokenization_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE}
- -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=tokenization_wf.py
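
Note for reviewers: the same structure repeats in every pipeline touched by this patch, so a condensed, stand-alone sketch of the pattern may help when porting further transforms. The transform name and the mytransform_sleep_sec parameter below are illustrative placeholders (not files in this repository); the SDK calls are the same ones the patch uses, assuming the kfp SDK and the repo's workflow_support package are installed.

import os
import uuid

import kfp.components as comp
import kfp.dsl as dsl


def compute_exec_params_func(worker_options: str, actor_options: str, mytransform_sleep_sec: int) -> dict:
    # executed inside the component container, hence the runtime import
    from workflow_support.runtime_utils import KFPUtils

    # a single dict carries every Ray-job parameter; the pipeline passes it
    # to the execute-job component as exec_params=compute_exec_params.output
    return {
        "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options),
        "runtime_worker_options": actor_options,
        "mytransform_sleep_sec": mytransform_sleep_sec,
    }


base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6"

# choose the component factory that matches the installed SDK generation
if os.getenv("KFPv2", "0") == "1":
    compute_exec_params_op = dsl.component_decorator.component(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = uuid.uuid4().hex  # fixed at compilation time, shared with the cleanup task
else:
    compute_exec_params_op = comp.create_component_from_func(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = dsl.RUN_ID_PLACEHOLDER  # resolved by KFPv1 at run time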
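
Similarly, for the fdedup_compute_execution_params refactor earlier in this patch: the fuzzy_optimal_param call keeps its existing signature (threshold, num_perm, false_positive_weight, false_negative_weight) and the scipy.integrate.quad-based probability helpers. For readers unfamiliar with that step, the sketch below shows the standard LSH band/row search that this kind of helper typically performs; the helper bodies here are illustrative (modeled on the classic datasketch-style optimal_param search), not a copy of the repository's exact code.

from scipy.integrate import quad as integrate


def _false_positive_probability(threshold: float, b: int, r: int) -> float:
    # probability that a pair with similarity below the threshold still collides in some band
    proba = lambda s: 1 - (1 - s ** float(r)) ** float(b)
    area, _ = integrate(proba, 0.0, threshold)
    return area


def _false_negative_probability(threshold: float, b: int, r: int) -> float:
    # probability that a pair with similarity above the threshold never collides
    proba = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
    area, _ = integrate(proba, threshold, 1.0)
    return area


def fuzzy_optimal_param(
    threshold: float, num_perm: int, false_positive_weight: float, false_negative_weight: float
) -> tuple[int, int]:
    # try every split of num_perm permutations into b bands of r rows and
    # keep the split with the smallest weighted error
    min_error = float("inf")
    opt = (1, 1)
    for b in range(1, num_perm + 1):
        for r in range(1, num_perm // b + 1):
            fp = _false_positive_probability(threshold, b, r)
            fn = _false_negative_probability(threshold, b, r)
            error = fp * false_positive_weight + fn * false_negative_weight
            if error < min_error:
                min_error = error
                opt = (b, r)
    return opt


if __name__ == "__main__":
    # e.g. num_buckets, length_bucket for threshold=0.8 and 64 permutations
    print(fuzzy_optimal_param(0.8, 64, 0.5, 0.5))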