Skip to content

Commit

Permalink
Merge pull request #640 from IBM/test-conflicts
Browse files Browse the repository at this point in the history
Test and address conflicts when using the transforms package in a language application with specific requirements.txt file
  • Loading branch information
touma-I authored Sep 30, 2024
2 parents 4b59c9b + ba24868 commit 83cec8f
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 49 deletions.
39 changes: 37 additions & 2 deletions transforms/packaging/.make.packaging
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# T_SET names the set of transforms to package. It is used below to pick
# requirements.$(T_SET).txt. When the caller has not supplied a non-empty
# value, it falls back to "all".
ifndef T_SET
T_SET=all
endif


venv:
$(MAKE) .defaults.create-venv
Expand All @@ -11,6 +15,9 @@ image:: .transforms.python-image

run-ut::
source venv/bin/activate; \
if [ -e requirements.test.txt ]; then \
$(PYTHON) -m pip install -r requirements.test.txt ; \
fi; \
for T in $(TRANSFORMS_NAMES); do \
echo running unit test on: $$T ; \
$(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \
Expand All @@ -25,23 +32,51 @@ setup: .transforms.setup venv
@# Help: Do any default transform setup before running make src and setting up a test environment


# Activate the dependency list of the selected transform set: when
# requirements.$(T_SET).txt exists it is copied over requirements.txt.
# When it does not exist nothing is copied and the recipe still succeeds.
requirements:
	[ ! -e requirements.$(T_SET).txt ] || \
		cp requirements.$(T_SET).txt requirements.txt

# Rewrite the `name = ...` line of pyproject.toml so the package is built
# under the name held in TRANSFORM_PKG. Nothing is done when TRANSFORM_PKG
# is empty.
# NOTE(review): TRANSFORM_PKG appears to be defined by the including Makefile
# with its own double quotes; the unquoted shell test relies on that - confirm.
pkg-name:
	if [ $(TRANSFORM_PKG) ]; then \
		sed -e 's/^name[ ]*=.*/name = "'$(TRANSFORM_PKG)'"/' \
			pyproject.toml > tt.toml && \
		mv tt.toml pyproject.toml; \
	fi
	@# The && above keeps pyproject.toml untouched when sed fails.

# When IS_PATCH is set, strip the ".devN" suffix from the `version = "..."`
# line of pyproject.toml so that a patch release is built instead of a
# development release. Nothing is done when IS_PATCH is empty.
is-patch:
	if [ $(IS_PATCH) ]; then \
		sed -e 's/^version[ ]*=[ ]*"\(.*\)\.dev.*/version = "\1"/' \
			pyproject.toml > tt.toml && \
		mv tt.toml pyproject.toml; \
	fi
	@# "\.dev" matches a literal dot; && keeps pyproject.toml intact if sed fails.

#####################################################
# to build a patched release, use make IS_PATCH=1 src
#####################################################
# Prepare the packaging folder:
#   1. create src,
#   2. run the requirements, pkg-name and is-patch targets through $(MAKE) so
#      that the same make binary and its flags are used by the sub-makes,
#   3. copy the src content of every transform listed in TRANSFORMS_NAMES
#      into src and remove leftovers of earlier builds (*.egg-info, dist, build).
# TRANSFORMS_NAMES, REPOROOT and PACKAGING_RUN_TIME are defined outside this file.
src:
	mkdir src
	$(MAKE) requirements
	$(MAKE) pkg-name
	$(MAKE) is-patch
	for T in $(shell echo $(TRANSFORMS_NAMES)); do \
		echo copy src from $$T ; \
		cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \
		rm -fr *.egg-info ; \
		rm -fr dist ; \
		rm -fr build ; \
	done;
	@# Help: Setup src folder and remove old distribution. to setup for a patched release use: make IS_PATCH=1 $@


# `build` and `publish` are thin aliases for the corresponding *-dist targets.
build:: build-dist

publish:: publish-dist

# build-dist depends on the src target and on .defaults.build-dist; the
# latter is not defined in this file.
build-dist:: src .defaults.build-dist
	@# Help: build the distribution for publishing to pypi. to build a patch release (no .devN) use: make IS_PATCH=1 $@

# publish-dist delegates to .defaults.publish-dist, which is not defined in this file.
publish-dist:: .defaults.publish-dist

Expand Down
58 changes: 43 additions & 15 deletions transforms/packaging/python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,11 @@ include ../.make.packaging

PACKAGING_RUN_TIME=python


#Excluded List
# ./code/malware
# ./universal/html2parquet
# ./universal/profiler # Missing implementation
# ./universal/fdedup # Missing implementation
# code/repo_level_ordering # Missing implementation


ifeq ($(T_SET), all)
# language/html2parquet cannot be packaged together with language/pdf2parquet
# because their dependencies require incompatible versions of lxml:
#   docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
#   trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
TRANSFORMS_NAMES = code/code_quality \
code/code2parquet \
code/header_cleanser \
Expand All @@ -31,12 +27,26 @@ TRANSFORMS_NAMES = code/code_quality \
language/pdf2parquet \
language/pii_redactor \
language/text_encoder \
universal/tokenization \
universal/ededup \
/universal/doc_id \
universal/filter \
universal/resize \
universal/tokenization \
universal/doc_id
universal/resize
TRANSFORM_PKG = "data_prep_toolkit_transforms"
endif

# "lang1" set: language transforms that include html2parquet, plus the
# universal transforms, published under their own package name.
# Every entry is a path relative to the transforms folder, so none of them
# carries a leading slash.
ifeq ($(T_SET), lang1)
TRANSFORMS_NAMES = language/doc_quality \
	language/lang_id \
	language/text_encoder \
	language/html2parquet \
	universal/tokenization \
	universal/ededup \
	universal/doc_id \
	universal/filter \
	universal/resize
TRANSFORM_PKG = "data_prep_toolkit_transforms_lang1"
endif

# distribution versions is the same as image version.
set-versions:
Expand All @@ -49,13 +59,31 @@ test-src::
@# Help: Do any default transform setup before running make src and setting up a test environment

# Runs the clean and src targets, creates a venv, installs the package from
# the current folder into that venv and then runs the unit tests.
# NOTE(review): `source` is a bash builtin; this assumes SHELL is set to bash
# outside this view - confirm.
test-with-pypi:
	$(MAKE) clean
	$(MAKE) src
	$(MAKE) .defaults.create-venv
	source venv/bin/activate && \
		$(PYTHON) -m pip install .
	$(MAKE) run-ut
	@# Help: Load dependencies from pypi and run all unit tests: final step in verification BEFORE deploying to pypi)


# Recreates the venv, installs the wheel(s) found in dist/ into it and runs
# the unit tests against that installation.
# The install is chained with && so that pip never runs outside the venv
# when activation fails.
test-wheel:
	-rm -fr venv
	$(MAKE) .defaults.create-venv
	source venv/bin/activate && \
		$(PYTHON) -m pip install dist/*.whl
	$(MAKE) run-ut
	@# Help: Load wheel from local folder and run all unit tests



# Runs the clean target, creates a venv, installs the package named by
# TRANSFORM_PKG with pip and runs the unit tests.
# The install is chained with && so that pip never runs outside the venv
# when activation fails.
test-latest-patch:
	$(MAKE) clean
	$(MAKE) .defaults.create-venv
	source venv/bin/activate && \
		$(PYTHON) -m pip install $(TRANSFORM_PKG)
	$(MAKE) run-ut
	@# Help: Load wheel from pypi and run all unit tests: final step in verification AFTER deploying to pypi)



2 changes: 1 addition & 1 deletion transforms/packaging/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ installing the python transforms will also install `data-prep-toolkit`

## List of Transforms in current package

Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may may not be always up to date. users are encourage to raise an issue in git when they discover missing components
Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up-to-date information. Users are encouraged to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from PyPI.

* code
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
Expand Down
51 changes: 51 additions & 0 deletions transforms/packaging/python/requirements.all.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
data-prep-toolkit>=0.2.1
# code quality
bs4==0.0.2
transformers==4.38.2
#pdf2parquet
docling-core==1.3.0
docling-ibm-models==1.1.7
deepsearch-glm==0.21.0
docling==1.11.0
filetype >=1.2.0, <2.0.0
#Doc chunking
docling-core==1.3.0
llama-index-core>=0.11.0,<0.12.0
#filter
duckdb>=0.10.1
#langid
fasttext==0.9.2
langcodes==3.3.0
huggingface-hub >= 0.21.4, <1.0.0
numpy==1.26.4
#fdedup
mmh3>=4.1.0
xxhash==3.4.1
tqdm==4.66.3
scipy>=1.12.0, <2.0.0
# ededup
mmh3>=4.1.0
xxhash==3.4.1
#code2parquet
pandas
parameterized
#header cleanser
scancode-toolkit==32.1.0 ; platform_system != 'Darwin'
#text_encoder
sentence-transformers==3.0.1
# PII-redactor
presidio-analyzer>=2.2.355
presidio-anonymizer>=2.2.355
flair>=0.14.0
pandas>=2.2.2
#html2parquet
# trafilatura is left out of this set because pip cannot resolve it together with the pdf2parquet dependencies.
#The conflict is caused by:
# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
#trafilatura==1.12.0
#tokenization
transformers==4.38.2



32 changes: 32 additions & 0 deletions transforms/packaging/python/requirements.lang1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
data-prep-toolkit>=0.2.1
#filter
duckdb>=0.10.1
#langid
fasttext==0.9.2
langcodes==3.3.0
huggingface-hub >= 0.21.4, <1.0.0
numpy==1.26.4
#fdedup
mmh3>=4.1.0
xxhash==3.4.1
tqdm==4.66.3
scipy==1.12.0
# ededup
mmh3>=4.1.0
xxhash==3.4.1
#text_encoder
sentence-transformers>=3.0.1
#html2parquet
trafilatura==1.12.0
#tokenization
transformers==4.38.2

# NOTE: pip reported the following dependency conflicts for this set when data-prep-toolkit-transforms 0.2.2.dev0 was also installed (pip's resolver does not take all installed packages into account):
#   data-prep-toolkit-transforms 0.2.2.dev0 requires duckdb==0.10.1, but duckdb 1.1.0 was installed, which is incompatible.
#   data-prep-toolkit-transforms 0.2.2.dev0 requires sentence-transformers==3.0.1, but sentence-transformers 3.1.1 was installed, which is incompatible.






31 changes: 0 additions & 31 deletions transforms/packaging/python/requirements.txt

This file was deleted.

0 comments on commit 83cec8f

Please sign in to comment.