diff --git a/.make.versions b/.make.versions index 0dcbc7520..a8b443a98 100644 --- a/.make.versions +++ b/.make.versions @@ -18,26 +18,37 @@ DPK_LIB_KFP_SHARED=0.2.0$(RELEASE_VERSION_SUFFIX) # Begin transform versions/tags BLOCKLIST_VERSION=0.4.2$(RELEASE_VERSION_SUFFIX) -DOC_ID_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -DOC_ID_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) -EDEDUP_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -FDEDUP_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +DOC_ID_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) +DOC_ID_SPARK_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +EDEDUP_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +FDEDUP_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + FILTER_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) FILTER_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -FILTER_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) FILTER_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) + NOOP_PYTHON_VERSION=0.9.0$(RELEASE_VERSION_SUFFIX) NOOP_RAY_VERSION=0.9.0$(RELEASE_VERSION_SUFFIX) NOOP_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) + RESIZE_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + LANG_ID_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) LANG_ID_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -TOKENIZATION_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -MALWARE_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX) -PROGLANG_SELECT_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -CODE_QUALITY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -DOC_QUALITY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) -INGEST_TO_PARQUET_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) +TOKENIZATION_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) +TOKENIZATION_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +MALWARE_RAY_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX) + +PROGLANG_SELECT_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +CODE_QUALITY_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) + +INGEST_TO_PARQUET_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) +INGEST_TO_PARQUET_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX) KFP_DOCKER_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX) diff --git 
a/data-processing-lib/doc/spark-runtime.md b/data-processing-lib/doc/spark-runtime.md index 87dd20c18..80bfa9d62 100644 --- a/data-processing-lib/doc/spark-runtime.md +++ b/data-processing-lib/doc/spark-runtime.md @@ -4,7 +4,7 @@ The Spark runtime extends the base framework with the following set of component ## Transforms * [AbstractSparkTransform](../spark/src/data_processing_spark/runtime/spark/spark_transform.py) - this - is the base class for all spark-based tranforms over spark DataFrames. + is the base class for all spark-based transforms over spark DataFrames. * [SparkTransformConfiguration](../spark/src/data_processing_spark/runtime/spark/spark_transform_config.py) - this is simple extension of the base TransformConfiguration class to hold the transformation class (an extension of AbstractSpartTransform). diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 539d3cdf5..9c76ee559 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -56,7 +56,7 @@ Ray cluster. For each step we have to define a component that will execute them: ```python # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2" - # compute execution parameters. Here different tranforms might need different implementations. As + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
compute_exec_params_op = comp.func_to_container_op( func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image diff --git a/kfp/pipeline_generator/pipeline.ptmpl b/kfp/pipeline_generator/pipeline.ptmpl index b21d4f7a8..3b1ecaaac 100644 --- a/kfp/pipeline_generator/pipeline.ptmpl +++ b/kfp/pipeline_generator/pipeline.ptmpl @@ -22,7 +22,7 @@ base_kfp_image = "__kfp_base_image__" # path to kfp component specifications files component_spec_path = "__component_spec_path__" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. compute_exec_params_op = comp.func_to_container_op( func=__compute_func_name__, base_image=base_kfp_image diff --git a/transforms/.make.workflows b/transforms/.make.workflows index d1a5fe82f..e40d3fb43 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -1,4 +1,6 @@ -include ${REPOROOT}/.make.versions +IGNORE := $(shell bash -c "sed -nr /^[^\#]*=/p ${REPOROOT}/.make.versions | sed 's/=/:=/' | sed 's/^/export /' > makeenv") + +include makeenv include ${REPOROOT}/kfp/requirements.env # Include the common rules. 
@@ -14,7 +16,8 @@ endef # FIXME .PHONY: .transforms_workflows.reconcile-requirements .transforms_workflows.reconcile-requirements: - + cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) reconcile-requirements + ${REPOROOT}/transforms/hack/update_workflow_tags.sh ${REPOROOT}/.make.versions ${PIPELINE_FILE} .PHONY: .transforms_workflows.compile-pipeline .transforms_workflows.compile-pipeline: diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 1428ce4c0..ee8d4d876 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -29,7 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( worker_options: str, diff --git a/transforms/code/code_quality/ray/Makefile b/transforms/code/code_quality/ray/Makefile index c3102204d..7678ecd5e 100644 --- a/transforms/code/code_quality/ray/Makefile +++ b/transforms/code/code_quality/ray/Makefile @@ -6,7 +6,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=code_quality # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${CODE_QUALITY_VERSION} +DOCKER_IMAGE_VERSION=${CODE_QUALITY_RAY_VERSION} # Use default rule inherited from makefile.common clean:: .transforms.clean diff --git a/transforms/code/ingest_2_parquet/kfp_ray/v1/ingest_2_parquet_wf.py b/transforms/code/ingest_2_parquet/kfp_ray/v1/ingest_2_parquet_wf.py index d911d5218..a3008dbfa 100644 --- a/transforms/code/ingest_2_parquet/kfp_ray/v1/ingest_2_parquet_wf.py +++ b/transforms/code/ingest_2_parquet/kfp_ray/v1/ingest_2_parquet_wf.py @@ -32,7 +32,7 @@ # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
compute_exec_params_op = comp.func_to_container_op( func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image diff --git a/transforms/code/ingest_2_parquet/ray/Makefile b/transforms/code/ingest_2_parquet/ray/Makefile index 68b2efd3e..f23d1c518 100644 --- a/transforms/code/ingest_2_parquet/ray/Makefile +++ b/transforms/code/ingest_2_parquet/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=ingest_2_parquet # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${INGEST_TO_PARQUET_VERSION} +DOCKER_IMAGE_VERSION=${INGEST_TO_PARQUET_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 507917219..fd7d49d7d 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -30,7 +30,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( worker_options: str, diff --git a/transforms/code/malware/ray/Makefile b/transforms/code/malware/ray/Makefile index 06d86a0a7..748ef763c 100644 --- a/transforms/code/malware/ray/Makefile +++ b/transforms/code/malware/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=malware # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${MALWARE_VERSION} +DOCKER_IMAGE_VERSION=${MALWARE_RAY_VERSION} OS := $(shell uname -s) ifeq ($(OS),Darwin) diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index bc9eeff84..27b52a891 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -31,7 +31,7 @@ component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( worker_options: str, diff --git a/transforms/code/proglang_select/ray/Makefile b/transforms/code/proglang_select/ray/Makefile index 8f10a77a8..81216fb62 100644 --- a/transforms/code/proglang_select/ray/Makefile +++ b/transforms/code/proglang_select/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=proglang_select # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${PROGLANG_SELECT_VERSION} +DOCKER_IMAGE_VERSION=${PROGLANG_SELECT_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/hack/update_workflow_tags.sh b/transforms/hack/update_workflow_tags.sh new file mode 100755 index 000000000..14f1bb1c3 --- /dev/null +++ b/transforms/hack/update_workflow_tags.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Rewrites the image tags in a pipeline file to the versions exported in the environment. +# Usage: update_workflow_tags.sh <versions_file> <pipeline_path> +# Each NAME=... assignment in <versions_file> names an environment variable holding the tag for +# the image derived from NAME; KFP_DOCKER_VERSION must be exported as well. + +set -euo pipefail + +if [[ $# != 2 ]]; then + cat >&2 << EOF + Incorrect number of parameters provided. The required parameters are versions_file and pipeline_path. +EOF + exit 1 +fi + +versions_file=$1 +pipeline_path=$2 + +# Only NAME=value assignments are processed; blank lines, comments and anything else are skipped. +assignment_re='^[[:space:]]*([A-Za-z_][A-Za-z0-9_]*)=' + +# Modify the task tags as defined in the versions file +while IFS= read -r line; do + [[ $line =~ $assignment_re ]] || continue + VERSION_NAME=${BASH_REMATCH[1]} + DOCKER_IMAGE_NAME=$(echo "$VERSION_NAME" | sed "s/_VERSION//" | tr '[:upper:]' '[:lower:]') + DOCKER_IMAGE_NAME=$(echo "$DOCKER_IMAGE_NAME" | sed -e "s/_ray$/-ray/" -e "s/_spark$/-spark/" -e "s/_parquet$/-parquet/") + # The value is read straight from the environment variable named by VERSION_NAME (no eval). + DOCKER_IMAGE_VERSION=${!VERSION_NAME} + # -i.back edits in place and leaves a <pipeline_path>.back copy of the previous content. + sed -i.back "s/data-prep-kit\/$DOCKER_IMAGE_NAME:.*/data-prep-kit\/$DOCKER_IMAGE_NAME:$DOCKER_IMAGE_VERSION\"/" "$pipeline_path" +done < "$versions_file" +# Update kfp component image tag +sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:$KFP_DOCKER_VERSION\"/" "$pipeline_path" diff --git a/transforms/language/lang_id/Makefile b/transforms/language/lang_id/Makefile index 7735c3251..41413c041 100644 --- a/transforms/language/lang_id/Makefile +++ 
b/transforms/language/lang_id/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/language/lang_id/kfp_ray/Makefile b/transforms/language/lang_id/kfp_ray/Makefile index fb56f30a7..fac8ac2dd 100644 --- a/transforms/language/lang_id/kfp_ray/Makefile +++ b/transforms/language/lang_id/kfp_ray/Makefile @@ -33,9 +33,7 @@ set-versions: workflow-reconcile-requirements .PHONY: workflow-build workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done + $(MAKE) $(YAML_WF) .PHONY: workflow-test workflow-test: workflow-build diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index a8ddc6b6e..e60e9b138 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -28,7 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( worker_options: str, diff --git a/transforms/universal/doc_id/ray/Makefile b/transforms/universal/doc_id/ray/Makefile index fa945aa0b..b5867b6ef 100644 --- a/transforms/universal/doc_id/ray/Makefile +++ b/transforms/universal/doc_id/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=doc_id # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${DOC_ID_VERSION} +DOCKER_IMAGE_VERSION=${DOC_ID_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 3ddd55b7b..b3ff448f7 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -62,4 +62,5 @@ workflow-upload: $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/ededup/ray/Makefile b/transforms/universal/ededup/ray/Makefile index 8804803ee..bf79f20cb 100644 --- a/transforms/universal/ededup/ray/Makefile +++ b/transforms/universal/ededup/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=ededup # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${EDEDUP_VERSION} +DOCKER_IMAGE_VERSION=${EDEDUP_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile index e0c0af803..257a43345 100644 --- a/transforms/universal/fdedup/kfp_ray/Makefile +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -27,6 +27,8 @@ image:: test-image:: +load-image:: + set-versions: workflow-reconcile-requirements .PHONY: workflow-build diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index 7a524b9a7..19f2be284 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ 
b/transforms/universal/fdedup/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=fdedup # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${FDEDUP_VERSION} +DOCKER_IMAGE_VERSION=${FDEDUP_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index b398652f6..23c89a3a8 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -30,7 +30,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( worker_options: str, diff --git a/transforms/universal/filter/ray/Makefile b/transforms/universal/filter/ray/Makefile index a740c9ce8..3d549dc86 100644 --- a/transforms/universal/filter/ray/Makefile +++ b/transforms/universal/filter/ray/Makefile @@ -9,7 +9,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=filter # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${FILTER_VERSION} +DOCKER_IMAGE_VERSION=${FILTER_RAY_VERSION} venv:: .transforms.ray-venv diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 5e39c3249..417a506c9 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -29,7 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. 
As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( worker_options: str, diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 87a73059e..451fe31f7 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -29,7 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( worker_options: str, diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 21af91374..3b2ebac57 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -30,7 +30,7 @@ component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters. Here different tranforms might need different implementations. As +# compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( worker_options: str, diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile index 9b11d8cdd..5a3851821 100644 --- a/transforms/universal/tokenization/python/Makefile +++ b/transforms/universal/tokenization/python/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=tokenization # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${TOKENIZATION_VERSION} +DOCKER_IMAGE_VERSION=${TOKENIZATION_PYTHON_VERSION} venv:: .transforms.python-venv diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile index 79d62a765..dcf525ce6 100644 --- a/transforms/universal/tokenization/ray/Makefile +++ b/transforms/universal/tokenization/ray/Makefile @@ -8,7 +8,7 @@ include $(REPOROOT)/transforms/.make.transforms TRANSFORM_NAME=tokenization # $(REPOROOT)/.make.versions file contains the versions -DOCKER_IMAGE_VERSION=${TOKENIZATION_VERSION} +DOCKER_IMAGE_VERSION=${TOKENIZATION_RAY_VERSION} venv:: .transforms.ray-venv