Skip to content

Commit

Permalink
Merge pull request #622 from IBM/v0.2.1
Browse files Browse the repository at this point in the history
Pull request source branch to cut release 0.2.1 - All tests have passed.
  • Loading branch information
touma-I authored Sep 25, 2024
2 parents 877acab + 446b45a commit 3236911
Show file tree
Hide file tree
Showing 90 changed files with 242 additions and 212 deletions.
2 changes: 1 addition & 1 deletion .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2
DPK_MICRO_VERSION=1
# The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
# It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
DPK_VERSION_SUFFIX=.dev3
DPK_VERSION_SUFFIX=

DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)

Expand Down
4 changes: 2 additions & 2 deletions data-processing-lib/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Library"
Expand Down Expand Up @@ -41,7 +41,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/data_processing"]
Expand Down
6 changes: 3 additions & 3 deletions data-processing-lib/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_ray"
version = "0.2.1.dev3"
version = "0.2.1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10"
description = "Data Preparation Toolkit Library for Ray"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit>=0.2.1.dev3",
"data-prep-toolkit>=0.2.1",
"ray[default]==2.24.0",
# These two are to fix security issues identified by quay.io
"fastapi>=0.110.2",
Expand Down Expand Up @@ -42,7 +42,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/data_processing_ray"]
Expand Down
1 change: 0 additions & 1 deletion data-processing-lib/spark/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ include $(REPOROOT)/.make.defaults
SPARK_VERSION=3.5.2
DOCKER_IMAGE_NAME=data-prep-kit-spark-$(SPARK_VERSION)
DOCKER_IMAGE_LIB_NAME=data-prep-kit-spark
DOCKER_IMAGE_VERSION := latest


.check-env::
Expand Down
4 changes: 2 additions & 2 deletions data-processing-lib/spark/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_spark"
version = "0.2.1.dev3"
version = "0.2.1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10"
description = "Data Preparation Toolkit Library for Spark"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"pyspark>=3.5.2",
"psutil>=6.0.0"
]
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/createRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/deleteRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeRayJobComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeSubWorkflowComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ outputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists, and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v1"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -13,7 +13,7 @@ authors = [
]
dependencies = [
"kfp==1.8.22",
"data-prep-toolkit-kfp-shared==0.2.1.dev3",
"data-prep-toolkit-kfp-shared==0.2.1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v2"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"kfp==2.8.0",
"kfp-kubernetes==1.2.0",
"data-prep-toolkit-kfp-shared==0.2.1.dev3",
"data-prep-toolkit-kfp-shared==0.2.1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_shared"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"requests",
"kubernetes",
"data-prep-toolkit-ray==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1",
]

[build-system]
Expand Down
16 changes: 8 additions & 8 deletions kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
run_fuzzy_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_tokenization_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")

code_to_parquet_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest"
proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest"
code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"
malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest"
tokenizer_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest"
code_to_parquet_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:0.2.1"
proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.2.1"
code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.2.1"
malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.2.1"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.2.1"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.2.1"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.2.1"
tokenizer_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.2.1"


# Pipeline to invoke execution on remote resource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
run_exact_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_fuzzy_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")

doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.2.1"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.2.1"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.2.1"

# Pipeline to invoke execution on remote resource
@dsl.pipeline(
Expand Down
31 changes: 31 additions & 0 deletions release-notes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,36 @@
# Data Prep Kit Release notes

## Release 0.2.1 - 9/24/2024

### General
1. Bug fixes across the repo
1. Added AI Alliance RAG demo, tutorials, notebooks, and tips for running on Google Colab
1. Added new transforms and a single package for transforms published to PyPI
1. Improved CI/CD with targeted workflow triggered on specific changes to specific modules
1. New enhancements for cutting a release


### data-prep-toolkit libraries (python, ray, spark)

1. Restructure the repository to distinguish/separate runtime libraries
1. Split data-processing-lib/ray into python and ray
1. Spark runtime
1. Updated pyarrow version
1. Defined the required transform() method as abstract in AbstractTableTransform
1. Enabled configuration of the Makefile to use src or PyPI for data-prep-kit library dependencies


### KFP Workloads

1. Add a configurable timeout before destroying the deployed Ray cluster.

### Transforms

1. Added 7 new transforms including: language identification, profiler, repo level ordering, doc quality, pdf2parquet, HTML2Parquet and PII Transform
1. Added ededup python implementation and incremental ededup
1. Added fuzzy floating point comparison


## Release 0.2.0 - 6/27/2024

### General
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
# the name of the job script
EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py"

task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:0.2.1"


# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code2parquet/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_python"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "code2parquet Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,7 +10,7 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"parameterized",
"pandas",
]
Expand All @@ -33,7 +33,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/"]
Expand Down
8 changes: 4 additions & 4 deletions transforms/code/code2parquet/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_ray"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "code2parquet Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,8 +10,8 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit-ray==0.2.1.dev3",
"dpk-code2parquet-transform-python==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1",
"dpk-code2parquet-transform-python==0.2.1",
"parameterized",
"pandas",
]
Expand All @@ -34,7 +34,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/"]
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code_quality/kfp_ray/code_quality_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py"
PREFIX: str = ""

task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.2.1"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code_quality/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_quality_transform_python"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "Code Quality Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,7 +9,7 @@ authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"bs4==0.0.2",
"transformers==4.38.2",
]
Expand All @@ -32,7 +32,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/"]
Expand Down
8 changes: 4 additions & 4 deletions transforms/code/code_quality/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_quality_transform_ray"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "Code Quality Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,8 +9,8 @@ authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"dpk-code-quality-transform-python==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1.dev3",
"dpk-code-quality-transform-python==0.2.1",
"data-prep-toolkit-ray==0.2.1",
]

[build-system]
Expand All @@ -31,7 +31,7 @@ dev = [
]

[options]
package_dir = ["src","test"]
package_dir = ["src"]

[options.packages.find]
where = ["src/"]
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
EXEC_SCRIPT_NAME: str = "header_cleanser_transform_ray.py"
PREFIX: str = ""

task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:0.2.1"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
Loading

0 comments on commit 3236911

Please sign in to comment.