Split noop ray transform into ray and python runtimes. #221

Merged
21 commits, merged Jun 4, 2024

Commits
3a329cc
Split noop ray transform into ray and python runtimes.
daw3rd May 31, 2024
c2f7342
Fix typo in wheel name of noop/python/pyproject.toml
daw3rd May 31, 2024
c6464e1
Change name of transform file to noop_transform_ray.py in noop kfp wo…
daw3rd Jun 1, 2024
a60527f
Remove unused *split* targets from transforms/.make.transforms.
daw3rd Jun 1, 2024
1c2f7e1
Update .transforms_workflows.reconcile-requirements
revit13 Jun 2, 2024
0945463
Rename .make.transforms_workflows.
revit13 Jun 2, 2024
7764015
Update all transforms to align with new runtime-naming conventions
daw3rd Jun 3, 2024
0e1f17e
Trim down the number of ray and spark test fixtures as they are alrea…
daw3rd Jun 3, 2024
31d168b
Rename defaults.publish target to defaults.publish-image
daw3rd Jun 3, 2024
510d661
Begin adding default publishing and build targets in prep for publishi…
daw3rd Jun 3, 2024
a1d21cc
Get noop ray and python runtime distributions publishing to pypi.
daw3rd Jun 3, 2024
d2d3abb
Add publishing of noop spark runtime
daw3rd Jun 3, 2024
b6ba585
Reconcile and set versions on noop kfp and kfp components.
daw3rd Jun 3, 2024
0ececdb
Reset version number in spark lib
daw3rd Jun 4, 2024
60bad03
Update more kfp versions using make set-versions
daw3rd Jun 4, 2024
4f2af56
Add set-versions target to all makefiles.
daw3rd Jun 4, 2024
7bea08b
Merge branch 'dev' into runtime-reorg
daw3rd Jun 4, 2024
1fffb8e
Fix noop/spark/Dockerfile - temp lib location was moved in the last c…
daw3rd Jun 4, 2024
d9a0d3b
Merge branch 'runtime-reorg' of github.com:IBM/data-prep-kit into run…
daw3rd Jun 4, 2024
5e21dc8
Two more spark dockerfiles for changed path to lib.
daw3rd Jun 4, 2024
8521cec
Fix image version in noop kfp v1 *.py
daw3rd Jun 4, 2024
Files changed
6 changes: 3 additions & 3 deletions .gitignore
@@ -106,10 +106,10 @@ celerybeat.pid
.env
.venv
env/
venv/
**/venv/
ENV/
env.bak/
venv.bak/
**/venv.bak/

# Spyder project settings
.spyderproject
@@ -141,4 +141,4 @@ dmypy.json
**/*.yaml

# Ignore VSCode folder
.vscode/
.vscode/
166 changes: 123 additions & 43 deletions .make.defaults
@@ -175,6 +175,7 @@ __check_defined = \
# Left over python stuff
-find . -name '*.egg-info' | xargs rm -rf
-find . -name '__pycache__' | xargs rm -rf
-rm -rf dist

# We create both local and remote tags. Local seems to be needed when using our spark
# base image. Remote seems to be needed by kfp.
@@ -207,51 +208,73 @@ __check_defined = \
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
rm -rf data-processing-lib-python

# Build and image using the local Dockerfile and make the data-processing-lib/ray
# Build an image using the local Dockerfile and make the data-processing-lib/ray
# available in the current directory for use by the Dockerfile (i.e. to install the library).
# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
.PHONY: .defaults.ray-lib-src-image
.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= setting.
@# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
$(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
rm -rf data-processing-lib-python
rm -rf data-processing-lib-ray
rm -rf python-transform

# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
.PHONY: .defaults.spark-lib-src-image
.defaults.spark-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= setting.
@# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
$(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability
$(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-spark .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image
rm -rf data-processing-lib-python
rm -rf data-processing-spark
rm -rf data-processing-lib-spark
rm -rf python-transform

# Install the source from the given directory into an existing venv.
# Expects PYTHON_PROJECT_DIR and uses EXTRA_INDEX_URL if set.
# PYTHON_PROJECT_DIR is expected to contain src/ and pyproject.toml.
.PHONY: .defaults.install-src-venv
.defaults.install-src-venv::
@# Help: Install the $(PYTHON_PROJECT_DIR) project source into an existing venv
@echo Installing project source from $(PYTHON_PROJECT_DIR) into existing venv
@source venv/bin/activate; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $${extra_url} -e $(PYTHON_PROJECT_DIR);
@echo Installed python project source in $(PYTHON_PROJECT_DIR) into venv
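This parameterized target is reused by the runtime-specific install targets below; as a minimal sketch, a direct call looks like the following (the directory and index URL are illustrative, not taken from this PR):

    $(MAKE) PYTHON_PROJECT_DIR=../python EXTRA_INDEX_URL=https://test.pypi.org/simple .defaults.install-src-venv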

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.python-lib-src-venv
.defaults.python-lib-src-venv:: .defaults.venv .defaults.install-python-lib-src-venv
.defaults.python-lib-src-venv:: .defaults.create-venv .defaults.install-python-lib-src-venv .defaults.install-local-requirements-venv

# Install all source from the repo for a python runtime transform into an existing venv
.PHONY: .defaults.install-python-lib-src-venv
.defaults.install-python-lib-src-venv::
@# Help: Install Python data processing library source into existing venv
@echo Installing Python data processing library source to existing venv
@source venv/bin/activate; \
pip install pytest; \
pip uninstall -y data-prep-toolkit; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $${extra_url} -e $(DPK_PYTHON_LIB_DIR); \
if [ $$? -eq 0 ]; then \
echo Installed source from Python processing library for `which $(PYTHON)`; \
else \
echo ERROR installing source into `which $(PYTHON)`; \
fi
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
echo Installed source from Python processing library for `which $(PYTHON)`

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.ray-lib-src-venv
.defaults.ray-lib-src-venv:: .defaults.venv .defaults.install-ray-lib-src-venv
.defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv

# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source.
# Install all source from the repo for a ray runtime transform into an existing venv
.PHONY: .defaults.install-ray-lib-src-venv
.defaults.install-ray-lib-src-venv::
@# Help: Install Ray and Python data processing library source into existing venv
@@ -260,39 +283,27 @@
pip install pytest; \
pip uninstall -y data-prep-toolkit; \
pip uninstall -y data-prep-toolkit-ray; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $${extra_url} -e $(DPK_PYTHON_LIB_DIR); \
pip install $${extra_url} -e $(DPK_RAY_LIB_DIR); \
if [ $$? -eq 0 ]; then \
echo Installed source from Ray data processing library for `which $(PYTHON)`; \
else \
echo ERROR installing source into `which $(PYTHON)`; \
fi
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_RAY_LIB_DIR) .defaults.install-src-venv; \
echo Installed source from Ray data processing library for `which $(PYTHON)`

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.spark-lib-src-venv
.defaults.spark-lib-src-venv:: .defaults.venv .defaults.install-spark-lib-src-venv
.defaults.spark-lib-src-venv:: .defaults.create-venv .defaults.install-spark-lib-src-venv .defaults.install-local-requirements-venv

# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source.
# Install all source from the repo for a spark runtime transform into an existing venv
.PHONY: .defaults.install-spark-lib-src-venv
.defaults.install-spark-lib-src-venv::
@# Help: Install Spark and Python data processing library source into existing venv
@echo ""
@echo Installing Spark and Python data processing library source to existing venv
@source venv/bin/activate; \
pip install pytest; \
pip uninstall -y data-prep-toolkit; \
pip uninstall -y data-prep-toolkit-spark; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $${extra_url} -e $(DPK_PYTHON_LIB_DIR); \
pip install $${extra_url} -e $(DPK_SPARK_LIB_DIR); \
if [ $$? -eq 0 ]; then \
echo Installed source from Spark processing library for `which $(PYTHON)`; \
else \
echo ERROR installing source into `which $(PYTHON)`; \
fi
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_SPARK_LIB_DIR) .defaults.install-src-venv; \
echo Installed source from Spark processing library for `which $(PYTHON)`

.PHONY: .defaults.test-src
.defaults.test-src::
@@ -342,10 +353,10 @@ __check_defined = \
cd src; \
python $(RUN_FILE) $(RUN_ARGS)

# This expects the image to already be built and so does not depend on .defaults.publish.
# This expects the image to already be built and so does not depend on .defaults.publish-image.
# This allows others to define their own image building prior to publishing.
.PHONY: .defaults.publish
.defaults.publish::
.PHONY: .defaults.publish-image
.defaults.publish-image::
@# Help: Publish the $(DOCKER_LOCAL_IMAGE) to $(DOCKER_HOSTNAME) container registry
-$(DOCKER) logout $(DOCKER_HOSTNAME)
$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
@@ -357,17 +368,29 @@
# We use "pip" instead of "$(PIP)" below because otherwise if the user has overriddent PYTHON
# they will end up installing into that PYTHON and NOT the venv.
.PHONY: .defaults.venv
.defaults.venv:: .check_python_version
@# Help: Create the virtual environment using requirements.txt
.defaults.venv:: .check_python_version .defaults.create-venv .defaults.install-local-requirements-venv

.PHONY: .defaults.create-venv
.defaults.create-venv:
@# Help: Create the virtual environment using requirements.txt or pyproject.toml
$(PYTHON) -m venv venv
@source venv/bin/activate; \
pip install --upgrade pip; \
pip install wheel

# Install requirements defined in the current directory into an existing venv
.PHONY: .defaults.install-local-requirements-venv
.defaults.install-local-requirements-venv:
@source venv/bin/activate; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
if [ -e requirements.txt ]; then \
echo Install requirements from requirements.txt; \
pip install $$extra_url -r requirements.txt; \
elif [ -e pyproject.toml ]; then \
echo Install requirements using pyproject.toml; \
pip install $$extra_url -e .; \
fi
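For clarity, here is an illustrative expansion of the two branches above when EXTRA_INDEX_URL is set (the URL is an example only, not from this PR):

    # requirements.txt present:
    pip install --extra-index-url https://test.pypi.org/simple -r requirements.txt
    # otherwise, if pyproject.toml is present:
    pip install --extra-index-url https://test.pypi.org/simple -e .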

.PHONY: .defaults.check.installed
@@ -433,3 +456,60 @@ MINIO_ADMIN_PWD= localminiosecretkey
-mc mb local/test > /dev/null 2>&1 # Ignore if it already exists
mc cp --recursive $(MINIO_SRC) local/test/$(MINIO_DEST)


# Changes the version field of the pyproject.toml file to the given version
# and updates the referenced library versions as defined in .make.versions.
# Expects TOML_VERSION
.PHONY: .defaults.update-toml
.defaults.update-toml:
$(MAKE) TOML_VERSION=$(TOML_VERSION) .defaults.set-toml-version
$(MAKE) .defaults.update-toml-lib-dep-versions

# Changes the version field of the pyproject.toml file to the given version
# Expects TOML_VERSION
.PHONY: .defaults.set-toml-version
.defaults.set-toml-version:
@# Help: Set the version= field of pyproject.toml
cat pyproject.toml | sed -e \
's/^version[ ]*=.*/version = "'${TOML_VERSION}'"/' \
> tt.toml
mv tt.toml pyproject.toml

# Updates the version references to our repo libraries as defined in .make.versions
.PHONY: .defaults.update-toml-lib-dep-versions
.defaults.update-toml-lib-dep-versions:
@# Help: Update pyproject.toml to depend on lib versions defined in .make.versions
cat pyproject.toml | sed \
-e 's/"data-prep-toolkit-ray\(..\).*",/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)",/' \
-e 's/"data-prep-toolkit-spark\(..\).*",/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)",/' \
-e 's/"data-prep-toolkit-kfp\(..\).*",/"data-prep-toolkit-spark\1$(DPK_LIB_KFP_VERSION)",/' \
-e 's/"data-prep-toolkit\([=><][=><]\).*",/"data-prep-toolkit\1$(DPK_LIB_VERSION)",/' \
> tt.toml
mv tt.toml pyproject.toml
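As a before/after illustration of the dependency rewrite (the starting version is an example only), a pyproject.toml dependency line changes like this when DPK_LIB_VERSION=0.2.0.dev6:

    before: "data-prep-toolkit-ray==0.1.0",
    after:  "data-prep-toolkit-ray==0.2.0.dev6",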

# Build the distribution, usually in preparation for publishing with the .defaults.publish-dist target
.PHONY: .defaults.build-dist
.defaults.build-dist :
@# Help: Build the distribution for publishing to pypi
@if [ ! -e pyproject.toml ]; then \
echo ERROR: Building a distribution requires a local pyproject.toml file; \
exit 1; \
fi
rm -rf dist || true
rm -rf src/*egg-info || true
${PIP} install --upgrade build
${PYTHON} -m build

# Publish the distribution in the dist directory, usually created with the .defaults.build-dist target
.PHONY: .defaults.publish-dist
.defaults.publish-dist :
@# Help: Publish existing project distribution to pypi
@if [ ! -e dist ]; then \
echo ERROR: Publishing a distribution requires a local dist directory. Did you build?; \
exit 1; \
fi
${PYTHON} -m twine check dist/*
${PYTHON} -m twine upload --verbose --non-interactive dist/*
#@echo "create a git tag to reference published version"
#@git tag ${TAG}
#@git push origin ${TAG}
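These dot-prefixed targets are normally reached through wrapper targets, but as a sketch, a direct sequence would be the following (twine credentials are assumed to be configured in the environment; that setup is not part of this diff):

    make .defaults.build-dist     # rebuild dist/ via python -m build
    make .defaults.publish-dist   # twine check, then twine upload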
49 changes: 28 additions & 21 deletions .make.versions
@@ -1,31 +1,38 @@
################################################################################
# Here we attempt to capture/define all the version numbers used across the
# repository in Makefile format. These are generally considered the version
# numbers to be published on the next release/publishing of artifacts.
# numbers TO BE published on the NEXT release/publishing of artifacts.
################################################################################

# To cut a release:
# % make RELEASE_VERSION_SUFFIX= set-versions
# % git push, tag, etc.
# % make build, publish, etc.
RELEASE_VERSION_SUFFIX=.dev6

# Data prep lab wheel version
DPK_LIB_VERSION=0.2.0
DPK_LIB_KFP_VERSION=0.2.0
DPK_LIB_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)
DPK_LIB_KFP_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)

# Begin transform versions/tags
BLOCKLIST_VERSION=0.4.0
DOC_ID_VERSION=0.4.0
DOC_ID_SPARK_VERSION=0.2.0
EDEDUP_VERSION=0.4.0
FDEDUP_VERSION=0.4.0
FILTER_VERSION=0.4.0
FILTER_SPARK_VERSION=0.2.0
NOOP_VERSION=0.9.0
NOOP_SPARK_VERSION=0.2.0
RESIZE_VERSION=0.4.0
LANG_ID_VERSION=0.4.0
TOKENIZATION_VERSION=0.4.0
MALWARE_VERSION=0.5.0
PROGLANG_SELECT_VERSION=0.4.0
CODE_QUALITY_VERSION=0.4.0
DOC_QUALITY_VERSION=0.4.0
INGEST_TO_PARQUET_VERSION=0.4.0
BLOCKLIST_VERSION=0.4.2$(RELEASE_VERSION_SUFFIX)
DOC_ID_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
DOC_ID_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)
EDEDUP_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
FDEDUP_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
FILTER_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
FILTER_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)
NOOP_PYTHON_VERSION=0.9.0$(RELEASE_VERSION_SUFFIX)
NOOP_RAY_VERSION=0.9.0$(RELEASE_VERSION_SUFFIX)
NOOP_SPARK_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)
RESIZE_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
LANG_ID_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
TOKENIZATION_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
MALWARE_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX)
PROGLANG_SELECT_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
CODE_QUALITY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
DOC_QUALITY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
INGEST_TO_PARQUET_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)


KFP_DOCKER_VERSION=0.2.0
KFP_DOCKER_VERSION=0.2.0$(RELEASE_VERSION_SUFFIX)
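To make the suffix mechanics concrete, with RELEASE_VERSION_SUFFIX=.dev6 as above the variables expand as in this illustrative sketch:

    DPK_LIB_VERSION  -> 0.2.0.dev6
    NOOP_RAY_VERSION -> 0.9.0.dev6
    # For a final release, clear the suffix first: make RELEASE_VERSION_SUFFIX= set-versions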
9 changes: 9 additions & 0 deletions Makefile
@@ -43,6 +43,14 @@ test::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

lib-release:
@# Help: Publish data-prep-kit $(DPK_LIB_VERSION) and data-prep-kit-kfp $(DPK_LIB_KFP_VERSION) libraries to pypi
@$(MAKE) -C $(DPK_PYTHON_LIB_DIR) build publish
@@ -54,3 +62,4 @@ lib-release:
@echo ""



8 changes: 8 additions & 0 deletions data-processing-lib/Makefile
@@ -47,3 +47,11 @@ image::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse
