From c6c2666694f2f8f67abfd86dedf384966e534267 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 03:26:04 -0400 Subject: [PATCH 1/8] Allow rebuilding partial list and relax requirements on duckdb and mmk3 Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 13 ++++- transforms/packaging/python/Makefile | 31 +++++----- .../packaging/python/requirements.all.txt | 56 +++++++++++++++++++ .../packaging/python/requirements.lang1.txt | 31 ++++++++++ transforms/packaging/python/requirements.txt | 31 ---------- 5 files changed, 117 insertions(+), 45 deletions(-) create mode 100644 transforms/packaging/python/requirements.all.txt create mode 100644 transforms/packaging/python/requirements.lang1.txt delete mode 100644 transforms/packaging/python/requirements.txt diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index 0ecc05484..bcb8ada8b 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -1,3 +1,7 @@ +ifndef T_SET +T_SET=all +endif + venv: $(MAKE) .defaults.create-venv @@ -11,6 +15,9 @@ image:: .transforms.python-image run-ut:: source venv/bin/activate; \ + if [ -e requirements.test.txt ]; then \ + $(PYTHON) -m pip install -r requirements.test.txt ; \ + fi; \ for T in $(TRANSFORMS_NAMES); do \ echo running unit test on: $$T ; \ $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ @@ -25,9 +32,13 @@ setup: .transforms.setup venv @# Help: Do any default transform setup before running make src and setting up a test environment +requirements: + cp requirements.$(T_SET).txt requirements.txt + src: mkdir src - for T in $(TRANSFORMS_NAMES); do \ + make requirements + for T in $(shell echo $(TRANSFORMS_NAMES)); do \ echo copy src from $$T ; \ cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ rm -fr *.egg-info ; \ diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 1271a20c3..b1903ee54 100644 
--- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -12,15 +12,7 @@ include ../.make.packaging PACKAGING_RUN_TIME=python - -#Excluded List -# ./code/malware -# ./universal/html2parquet -# ./universal/profiler # Missing implementation -# ./universal/fdedup # Missing implementation -# code/repo_level_ordering # Missing implementation - - +ifeq ($(T_SET), all) TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ code/header_cleanser \ @@ -31,12 +23,25 @@ TRANSFORMS_NAMES = code/code_quality \ language/pdf2parquet \ language/pii_redactor \ language/text_encoder \ + language/html2parquet \ + universal/tokenization \ universal/ededup \ + /universal/doc_id \ universal/filter \ - universal/resize \ - universal/tokenization \ - universal/doc_id + universal/resize +endif +ifeq ($(T_SET), lang1) +TRANSFORMS_NAMES = language/doc_quality \ + language/lang_id \ + language/text_encoder \ + language/html2parquet \ + universal/tokenization \ + universal/ededup \ + /universal/doc_id \ + universal/filter \ + universal/resize +endif # distribution versions is the same as image version. set-versions: @@ -52,7 +57,7 @@ test-with-pypi: $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRANSFORMS_VERSION) + $(PYTHON) -m pip install . 
$(MAKE) run-ut @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) diff --git a/transforms/packaging/python/requirements.all.txt b/transforms/packaging/python/requirements.all.txt new file mode 100644 index 000000000..8b96d3ce8 --- /dev/null +++ b/transforms/packaging/python/requirements.all.txt @@ -0,0 +1,56 @@ +#conflict to be resolved +#docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +#trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +data-prep-toolkit>=0.2.2.dev0 +# code quality +bs4==0.0.2 +transformers==4.38.2 +#pdf2parquet +# conflict with chunking.... +#docling-core==1.2.0, +docling-ibm-models==1.1.7, +deepsearch-glm==0.21.0, +docling==1.11.0, +filetype >=1.2.0, <2.0.0, +#Doc chunking +docling-core==1.3.0, +llama-index-core>=0.11.0,<0.12.0, +#filter +duckdb==0.10.1 +#langid +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 +#fdedup +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 +# ededup +mmh3>=4.1.0 +xxhash==3.4.1 +#code2parquet +pandas +parameterized +#header cleanser +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +#text_encoder +sentence-transformers==3.0.1 +# PII-redactor +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 +#html2parquet +#INFO: pip is looking at multiple versions of trafilatura to determine which version is compatible with other requirements. This could take a while. 
+#The conflict is caused by: +# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +# disable testing on Mac +trafilatura==1.12.0 ; platform_system != 'Darwin' +#tokenization +transformers==4.38.2 + + + diff --git a/transforms/packaging/python/requirements.lang1.txt b/transforms/packaging/python/requirements.lang1.txt new file mode 100644 index 000000000..40931b7b1 --- /dev/null +++ b/transforms/packaging/python/requirements.lang1.txt @@ -0,0 +1,31 @@ +#filter +duckdb>=0.10.1 +#langid +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 +#fdedup +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy==1.12.0 +# ededup +mmh3>=4.1.0, +xxhash==3.4.1 +#text_encoder +sentence-transformers>=3.0.1 +#html2parquet +trafilatura==1.12.0 +#tokenization +transformers==4.38.2 + +#ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +#data-prep-toolkit-transforms 0.2.2.dev0 requires duckdb==0.10.1, but you have duckdb 1.1.0 which is incompatible. +#data-prep-toolkit-transforms 0.2.2.dev0 requires sentence-transformers==3.0.1, but you have sentence-transformers 3.1.1 which is incompatible. + + + + + + diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt deleted file mode 100644 index 7bdbe9857..000000000 --- a/transforms/packaging/python/requirements.txt +++ /dev/null @@ -1,31 +0,0 @@ -data-prep-toolkit>=0.2.2.dev0 -bs4==0.0.2 -#pdf2parquet -# conflict with chunking.... 
-#docling-core==1.2.0, -docling-ibm-models==1.1.7, -deepsearch-glm==0.21.0, -docling==1.11.0, -filetype >=1.2.0, <2.0.0, -#Doc chunking -docling-core==1.3.0, -llama-index-core>=0.11.0,<0.12.0, -duckdb==0.10.1 -fasttext==0.9.2 -huggingface-hub >= 0.21.4, <1.0.0 -langcodes==3.3.0 -mmh3==4.1.0 -numpy==1.26.4 -pandas -parameterized -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -sentence-transformers==3.0.1 -transformers==4.38.2 -xxhash==3.4.1 -# PII-redactor -presidio-analyzer>=2.2.355 -presidio-anonymizer>=2.2.355 -flair>=0.14.0 -pandas>=2.2.2 - - From 58c0e1a82ad419d037c67ba6d97d5bba80064425 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 03:39:06 -0400 Subject: [PATCH 2/8] relax requirements on duckdb Signed-off-by: Maroun Touma --- transforms/packaging/python/requirements.all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/packaging/python/requirements.all.txt b/transforms/packaging/python/requirements.all.txt index 8b96d3ce8..df113e756 100644 --- a/transforms/packaging/python/requirements.all.txt +++ b/transforms/packaging/python/requirements.all.txt @@ -16,7 +16,7 @@ filetype >=1.2.0, <2.0.0, docling-core==1.3.0, llama-index-core>=0.11.0,<0.12.0, #filter -duckdb==0.10.1 +duckdb>=0.10.1 #langid fasttext==0.9.2 langcodes==3.3.0 From 0e353e4b7a56424057239d613aec28e9e8fd9d37 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 04:31:42 -0400 Subject: [PATCH 3/8] set package name based on list of transforms in package Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 13 ++++++++++++- transforms/packaging/python/Makefile | 7 ++++++- transforms/packaging/python/pyproject.toml | 4 ++-- .../packaging/python/requirements.all.txt | 17 ++++++----------- .../packaging/python/requirements.lang1.txt | 1 + 5 files changed, 27 insertions(+), 15 deletions(-) diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index bcb8ada8b..34a6a8982 100644 --- 
a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -33,11 +33,22 @@ setup: .transforms.setup venv requirements: - cp requirements.$(T_SET).txt requirements.txt + if [ -e requirements.$(T_SET).txt ]; then \ + cp requirements.$(T_SET).txt requirements.txt ; \ + fi + +tag: + if [ $(TRANSFORM_PKG) ]; then \ + cat pyproject.toml | sed -e \ + 's/^name[ ]*=.*/name = "'${TRANSFORM_PKG}'"/' \ + > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi src: mkdir src make requirements + make tag for T in $(shell echo $(TRANSFORMS_NAMES)); do \ echo copy src from $$T ; \ cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index b1903ee54..b747a40f8 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -13,6 +13,10 @@ include ../.make.packaging PACKAGING_RUN_TIME=python ifeq ($(T_SET), all) +# Cannot combine language/html2parquet with pdf2parquet due to: +#The conflict is caused by: +# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ code/header_cleanser \ @@ -23,12 +27,12 @@ TRANSFORMS_NAMES = code/code_quality \ language/pdf2parquet \ language/pii_redactor \ language/text_encoder \ - language/html2parquet \ universal/tokenization \ universal/ededup \ /universal/doc_id \ universal/filter \ universal/resize +TRANSFORM_PKG = "data_prep_toolkit_transforms" endif ifeq ($(T_SET), lang1) @@ -41,6 +45,7 @@ TRANSFORMS_NAMES = language/doc_quality \ /universal/doc_id \ universal/filter \ universal/resize +TRANSFORM_PKG = "data_prep_toolkit_transforms_lang1" endif # distribution versions is the same as image version. 
diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index 37e4c93da..03d54a2f6 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "data_prep_toolkit_transforms" -version = "0.2.2.dev0" +name = "data_prep_toolkit_transforms_lang1" +version = "0.2.2" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms" diff --git a/transforms/packaging/python/requirements.all.txt b/transforms/packaging/python/requirements.all.txt index df113e756..c1246fba9 100644 --- a/transforms/packaging/python/requirements.all.txt +++ b/transforms/packaging/python/requirements.all.txt @@ -1,17 +1,13 @@ -#conflict to be resolved -#docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -#trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -data-prep-toolkit>=0.2.2.dev0 +data-prep-toolkit>=0.2.1 # code quality bs4==0.0.2 transformers==4.38.2 #pdf2parquet -# conflict with chunking.... 
-#docling-core==1.2.0, -docling-ibm-models==1.1.7, -deepsearch-glm==0.21.0, +docling-core==1.3.0 +docling-ibm-models==1.1.7 +deepsearch-glm==0.21.0 docling==1.11.0, -filetype >=1.2.0, <2.0.0, +filetype >=1.2.0, <2.0.0 #Doc chunking docling-core==1.3.0, llama-index-core>=0.11.0,<0.12.0, @@ -47,8 +43,7 @@ pandas>=2.2.2 #The conflict is caused by: # docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 # trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -# disable testing on Mac -trafilatura==1.12.0 ; platform_system != 'Darwin' +#trafilatura==1.12.0 #tokenization transformers==4.38.2 diff --git a/transforms/packaging/python/requirements.lang1.txt b/transforms/packaging/python/requirements.lang1.txt index 40931b7b1..1c7289f64 100644 --- a/transforms/packaging/python/requirements.lang1.txt +++ b/transforms/packaging/python/requirements.lang1.txt @@ -1,3 +1,4 @@ +data-prep-toolkit>=0.2.1 #filter duckdb>=0.10.1 #langid From b2cfd427a1bdefc867a2e851c1219dc416b3a8bb Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 06:32:39 -0400 Subject: [PATCH 4/8] use IS-PATCH flag to determine patch release vs dev tag Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index 34a6a8982..9479748fc 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -37,7 +37,7 @@ requirements: cp requirements.$(T_SET).txt requirements.txt ; \ fi -tag: +pkg-name: if [ $(TRANSFORM_PKG) ]; then \ cat pyproject.toml | sed -e \ 's/^name[ ]*=.*/name = "'${TRANSFORM_PKG}'"/' \ @@ -45,10 +45,19 @@ tag: mv tt.toml pyproject.toml; \ fi +is-patch: + if [ $(IS_PATCH) ]; then \ + cat pyproject.toml | sed -e \ + 's/^version[ ]*=[ ]*"\(.*\).dev.*/version = "\1"/' \ + > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + src: mkdir src make 
requirements - make tag + make pkg-name + make is-patch for T in $(shell echo $(TRANSFORMS_NAMES)); do \ echo copy src from $$T ; \ cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ From 6ed5ffc08ad6fde9befe629b79a0d864c79e5846 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 06:34:06 -0400 Subject: [PATCH 5/8] Run test before and after publishing to pypi using released version of the library/runtime Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index b747a40f8..aae6f26e4 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -59,13 +59,20 @@ test-src:: @# Help: Do any default transform setup before running make src and setting up a test environment test-with-pypi: - $(MAKE) clean + $(MAKE) src $(MAKE) .defaults.create-venv source venv/bin/activate; \ $(PYTHON) -m pip install . 
$(MAKE) run-ut @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) +test-wheel: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install $(TRANSFORM_PKG) + $(MAKE) run-ut + @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) From e74298b3f17e10e4a4a143155ff9ff2156e7bb82 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 06:50:51 -0400 Subject: [PATCH 6/8] improve help message Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index 9479748fc..5268889d0 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -53,6 +53,9 @@ is-patch: mv tt.toml pyproject.toml; \ fi +##################################################### +# to build a patched release, use make IS_PATCH=1 src +##################################################### src: mkdir src make requirements @@ -65,7 +68,7 @@ src: rm -fr dist ; \ rm -fr build ; \ done; - @# Help: Setup src folder and remove old distribution + @# Help: Setup src folder and remove old distribution. to setup for a patched release use: make IS_PATCH=1 $@ build:: build-dist @@ -73,6 +76,7 @@ build:: build-dist publish:: publish-dist build-dist:: src .defaults.build-dist + @# Help: build the distribution for publishing to pypi. 
to build a patch release (no .devN) use: make IS_PATCH=1 $@ publish-dist:: .defaults.publish-dist From ba8f8199e7252fad9ee6c4641d0ba581d594682d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 07:36:10 -0400 Subject: [PATCH 7/8] added target for testing before and after pushing patch to pypi Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 15 +++++++++++++-- transforms/packaging/python/README.md | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index aae6f26e4..6a0a355de 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -64,15 +64,26 @@ test-with-pypi: source venv/bin/activate; \ $(PYTHON) -m pip install . $(MAKE) run-ut - @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) + @# Help: Load dependencies from pypi and run all unit tests: final step in verification BEFORE deploying to pypi) + test-wheel: + -rm -fr venv + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install dist/*.whl + $(MAKE) run-ut + @# Help: Load wheel from local folder and run all unit tests + + + +test-latest-patch: $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ $(PYTHON) -m pip install $(TRANSFORM_PKG) $(MAKE) run-ut - @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) + @# Help: Load wheel from pypi and run all unit tests: final step in verification AFTER deploying to pypi) diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md index 45260ce56..20eb0dff0 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/packaging/python/README.md @@ -10,7 +10,7 @@ installing the python transforms will also install `data-prep-toolkit` ## List of Transforms in current package -Note: This list includes the transforms 
that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may may not be always up to date. users are encourage to raise an issue in git when they discover missing components +Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up-to-date information. Users are encouraged to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from pypi. * code * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) From ba24868047cfade604afbeac97988de0843ea3b7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 29 Sep 2024 08:04:07 -0400 Subject: [PATCH 8/8] pyproject.toml follows set-versions. special releases are done by makefile locally Signed-off-by: Maroun Touma --- transforms/packaging/python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index 03d54a2f6..37e4c93da 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "data_prep_toolkit_transforms_lang1" -version = "0.2.2" +name = "data_prep_toolkit_transforms" +version = "0.2.2.dev0" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms"