diff --git a/yggdrasil_decision_forests/port/python/CHANGELOG.md b/yggdrasil_decision_forests/port/python/CHANGELOG.md index 11a4f982..4d40975d 100644 --- a/yggdrasil_decision_forests/port/python/CHANGELOG.md +++ b/yggdrasil_decision_forests/port/python/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 0.4.1 - 2024-04-18 + +### Fix + +- Solve dependency collision to YDF between PYDF and TF-DF. Previously, if + TF-DF was installed after PYDF, importing YDF failed with a `has no + attribute 'DType'` error. +- Allow for training on cached TensorFlow datasets. + ## 0.4.0 - 2024-04-10 ### Feature diff --git a/yggdrasil_decision_forests/port/python/config/setup.py b/yggdrasil_decision_forests/port/python/config/setup.py index 3f401c61..02b29597 100644 --- a/yggdrasil_decision_forests/port/python/config/setup.py +++ b/yggdrasil_decision_forests/port/python/config/setup.py @@ -21,7 +21,7 @@ from setuptools.command.install import install from setuptools.dist import Distribution -_VERSION = "0.4.0" +_VERSION = "0.4.1" with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() diff --git a/yggdrasil_decision_forests/port/python/tools/assembly_pip_files.py b/yggdrasil_decision_forests/port/python/tools/assembly_pip_files.py index a479db4d..df061e68 100644 --- a/yggdrasil_decision_forests/port/python/tools/assembly_pip_files.py +++ b/yggdrasil_decision_forests/port/python/tools/assembly_pip_files.py @@ -42,12 +42,35 @@ def rec_glob_copy(src_dir: str, dst_dir: str, pattern: str): s.copy(f"{src_dir}/{frel}", dst) +def replace_in_files(src_dir, extension, old_string, new_string): + """Replaces a string in all files with a given extension within a directory.""" + + for root, _, filenames in os.walk(src_dir): + for filename in filenames: + if filename.endswith(extension): + filepath = os.path.join(root, filename) + + # Read file content + with open(filepath, "r") as f: + file_content = f.read() + + # Replace the string + new_content = file_content.replace(old_string, 
new_string) + + # Overwrite the file with the modified content + with open(filepath, "w") as f: + f.write(new_content) + + # Remove and recreate the package directory if os.path.exists(DST_PK): try: s.rmtree(DST_PK) except Exception: - print("Fail to remove the existing dir with rmtree. Use rmdir instead.") + print( + "Fail to remove the existing dir with rmtree. Use rmdir instead (only" + " for Windows)." + ) os.system(f"rmdir /S /Q {DST_PK}") os.makedirs(DST_PK) @@ -69,10 +92,10 @@ def rec_glob_copy(src_dir: str, dst_dir: str, pattern: str): os.makedirs(f"{DST_PK}/ydf/learner") s.copy(f"{SRC_BIN}/learner/specialized_learners.py", f"{DST_PK}/ydf/learner") -# The YDF protos +# Copy the YDF c++ protos rec_glob_copy( "bazel-bin/external/ydf_cc/yggdrasil_decision_forests", - f"{DST_PK}/yggdrasil_decision_forests", + f"{DST_PK}/ydf/proto", "**/*.py", ) @@ -81,6 +104,11 @@ def rec_glob_copy(src_dir: str, dst_dir: str, pattern: str): # Create the missing __init__.py files INIT_FILENAME = "__init__.py" -for path, _, files in os.walk(f"{DST_PK}/yggdrasil_decision_forests"): +for path, _, files in os.walk(f"{DST_PK}/ydf/proto"): if INIT_FILENAME not in files: Path(f"{path}/{INIT_FILENAME}").touch() + +# Change path to YDF proto files +replace_in_files( + DST_PK, ".py", "from yggdrasil_decision_forests.", "from ydf.proto." +) diff --git a/yggdrasil_decision_forests/port/python/tools/build_pydf.sh b/yggdrasil_decision_forests/port/python/tools/build_pydf.sh index b4a1413f..cbb3348b 100755 --- a/yggdrasil_decision_forests/port/python/tools/build_pydf.sh +++ b/yggdrasil_decision_forests/port/python/tools/build_pydf.sh @@ -22,17 +22,6 @@ # Usage example: # # Generate the pip package with python3.9 # ./tools/build_pydf.sh python3.9 -# -# # Generate the pip package for all the versions of python using pyenv. -# # Make sure the package are compatible with manylinux2014. 
-# ./tools/build_pip_package.sh ALL_VERSIONS -# -# Requirements: -# -# pyenv (if using ALL_VERSIONS_ALREADY_ASSEMBLED or ALL_VERSIONS) -# See https://github.com/pyenv/pyenv-installer -# Will be installed by this script if INSTALL_PYENV is set to INSTALL_PYENV. -# set -xve @@ -78,33 +67,7 @@ function check_is_build() { function assemble_files() { check_is_build - rm -fr ${SRCPK} - mkdir -p ${SRCPK} - cp -R ydf config/setup.py config/MANIFEST.in README.md CHANGELOG.md ${SRCPK} - - # When cross-compiling, adapt setup.py - if [ ${ARG} == "ALL_VERSIONS_MAC_CROSSCOMPILE" ]; then - sed -i'.bak' -e "s/MAC_CROSS_COMPILED = False/MAC_CROSS_COMPILED = True/" ${SRCPK}/setup.py - fi - - # YDF's wrappers and .so. - SRCBIN="bazel-bin/ydf" - cp ${SRCBIN}/cc/ydf.so ${SRCPK}/ydf/cc/ - - cp ${SRCBIN}/learner/specialized_learners.py ${SRCPK}/ydf/learner/ - - # YDF's proto wrappers. - YDFSRCBIN="bazel-bin/external/ydf_cc/yggdrasil_decision_forests" - mkdir -p ${SRCPK}/yggdrasil_decision_forests - pushd ${YDFSRCBIN} - find . -name \*.py -exec rsync -R -arv {} ${SRCPK}/yggdrasil_decision_forests \; - popd - - # Copy the license file from YDF - cp bazel-python/external/ydf_cc/LICENSE ${SRCPK} - - # Add __init__.py to all exported Yggdrasil sub-directories. - find ${SRCPK}/yggdrasil_decision_forests -type d -exec touch {}/__init__.py \; + ${PYTHON} tools/assembly_pip_files.py } # Build a pip package. @@ -194,92 +157,5 @@ function e2e_native() { test_package ${PYTHON} ${PACKAGE} } -# Builds and tests a pip package in Pyenv. -function e2e_pyenv() { - VERSION="$1" - shift - - # Don't force updating pyenv, we use a fixed version. - # pyenv update - - ENVNAME=env_${VERSION} - pyenv install ${VERSION} -s - - # Enable pyenv virtual environment. - set +e - pyenv virtualenv ${VERSION} ${ENVNAME} - set -e - pyenv activate ${ENVNAME} - - e2e_native python3 - - # Disable virtual environment. 
- pyenv deactivate -} - -ARG="$1" -INSTALL_PYENV="$2" -shift | true - -if [ ${INSTALL_PYENV} == "INSTALL_PYENV" ]; then - if ! [ -x "$(command -v pyenv)" ]; then - echo "Pyenv not found." - echo "Installing build deps, pyenv 2.3.7 and pyenv virtualenv 1.2.1" - # Install python dependencies. - if ! is_macos; then - sudo apt-get update - sudo apt-get install -qq make build-essential libssl-dev zlib1g-dev \ - libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \ - libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev \ - libffi-dev liblzma-dev patchelf - fi - git clone https://github.com/pyenv/pyenv.git - ( - cd pyenv && git checkout 74f923b5fca82054b3c579f9eb936338c7f5a394 - ) - PYENV_ROOT="$(pwd)/pyenv" - export PATH="$PYENV_ROOT/bin:$PATH" - eval "$(pyenv init --path)" - eval "$(pyenv init -)" - git clone https://github.com/pyenv/pyenv-virtualenv.git $(pyenv root)/plugins/pyenv-virtualenv - ( - cd $(pyenv root)/plugins/pyenv-virtualenv && git checkout 13bc1877ef06ed038c65dcab4e901da6ea6c67ae - ) - eval "$(pyenv init --path)" - eval "$(pyenv init -)" - eval "$(pyenv virtualenv-init -)" - fi -fi - -if [ -z "${ARG}" ]; then - echo "The first argument should be one of:" - echo " ALL_VERSIONS: Build all pip packages using pyenv." - echo " ALL_VERSIONS_ALREADY_ASSEMBLED: Build all pip packages from already assembled files using pyenv." - echo " ALL_VERSIONS_MAC_CROSSCOMPILE: Build all pip packages from already assembled files using pyenv and cross-compile between MacOS ARM64 / Intel builds." - echo " Python binary (e.g. python3.9): Build a pip package for a specific python version without pyenv." - exit 1 -elif [ ${ARG} == "ALL_VERSIONS" ]; then - # Compile with all the version of python using pyenv. 
- assemble_files - eval "$(pyenv init -)" - e2e_pyenv 3.9.12 - e2e_pyenv 3.10.4 - e2e_pyenv 3.11.0 -elif [ ${ARG} == "ALL_VERSIONS_ALREADY_ASSEMBLED" ]; then - eval "$(pyenv init -)" - e2e_pyenv 3.9.12 - e2e_pyenv 3.10.4 - e2e_pyenv 3.11.0 -elif [ ${ARG} == "ALL_VERSIONS_MAC_CROSSCOMPILE" ]; then - eval "$(pyenv init -)" - assemble_files - e2e_pyenv 3.9.12 - e2e_pyenv 3.10.4 - e2e_pyenv 3.11.0 -else - # Compile with a specific version of python provided in the call arguments. - assemble_files - PYTHON=${ARG} - e2e_native ${PYTHON} -fi - +PYTHON="$1" +assemble_files ${PYTHON} diff --git a/yggdrasil_decision_forests/port/python/tools/build_windows_release.bat b/yggdrasil_decision_forests/port/python/tools/build_windows_release.bat index 36e287f6..ffb680de 100644 --- a/yggdrasil_decision_forests/port/python/tools/build_windows_release.bat +++ b/yggdrasil_decision_forests/port/python/tools/build_windows_release.bat @@ -34,7 +34,7 @@ cls setlocal -set YDF_VERSION=0.4.0 +set YDF_VERSION=0.4.1 set BAZEL=bazel.exe set BAZEL_SH=C:\msys64\usr\bin\bash.exe set BAZEL_FLAGS=--config=windows_cpp20 --config=windows_avx2 diff --git a/yggdrasil_decision_forests/port/python/ydf/dataset/io/tensorflow_io.py b/yggdrasil_decision_forests/port/python/ydf/dataset/io/tensorflow_io.py index dce32bbc..4b31fb16 100644 --- a/yggdrasil_decision_forests/port/python/ydf/dataset/io/tensorflow_io.py +++ b/yggdrasil_decision_forests/port/python/ydf/dataset/io/tensorflow_io.py @@ -14,9 +14,9 @@ """Connectors for loading data from Pandas dataframes.""" +import logging import sys from typing import Dict - from ydf.dataset.io import dataset_io_types @@ -24,12 +24,26 @@ def is_tensorflow_dataset(data: dataset_io_types.IODataset) -> bool: # Note: We only test if the dataset is a TensorFlow dataset if the object name # look like a TensorFlow object. This way, we avoid importing TF is not # necessary. 
- return ( - "tensorflow" in str(type(data)) - and data.__class__.__name__ - in ("_BatchDataset", "_MapDataset", "DatasetV1Adapter") - and hasattr(data, "rebatch") - ) + str_class = str(type(data)) + if "tensorflow" in str_class and hasattr(data, "rebatch"): + + if data.__class__.__name__ in ( + "_BatchDataset", + "_MapDataset", + "DatasetV1Adapter", + "CacheDataset", + ): + return True + + if "data.ops" in str_class: + logging.warning( + "The dataset %s object is not listed as a YDF compatible TensorFlow" + " Dataset, but it looks like one", + str_class, + ) + return True + + return False def to_dict( diff --git a/yggdrasil_decision_forests/port/python/ydf/learner/BUILD b/yggdrasil_decision_forests/port/python/ydf/learner/BUILD index 2497ba9e..d1957c8b 100644 --- a/yggdrasil_decision_forests/port/python/ydf/learner/BUILD +++ b/yggdrasil_decision_forests/port/python/ydf/learner/BUILD @@ -302,6 +302,7 @@ py_test( ":generic_learner", ":specialized_learners", # absl/testing:absltest dep, + # absl/testing:parameterized dep, # pandas dep, # tensorflow:tensorflow_no_contrib dep, ], diff --git a/yggdrasil_decision_forests/port/python/ydf/learner/learner_with_tf_test.py b/yggdrasil_decision_forests/port/python/ydf/learner/learner_with_tf_test.py index a19863fe..7d710112 100644 --- a/yggdrasil_decision_forests/port/python/ydf/learner/learner_with_tf_test.py +++ b/yggdrasil_decision_forests/port/python/ydf/learner/learner_with_tf_test.py @@ -14,8 +14,8 @@ """Tests for model learning.""" - from absl.testing import absltest +from absl.testing import parameterized import pandas as pd import tensorflow as tf @@ -34,9 +34,12 @@ def toy_dataset(): return df -class RandomForestLearnerTest(absltest.TestCase): +class RandomForestLearnerTest(parameterized.TestCase): - def test_tensorflow_dataset(self): + @parameterized.parameters({"use_cache": True}, {"use_filter": True}) + def test_tensorflow_dataset( + self, use_cache: bool = False, use_filter: bool = False + ): learner = 
specialized_learners.RandomForestLearner( label="label", num_trees=1 ) @@ -45,6 +48,10 @@ def test_tensorflow_dataset(self): ) for x in tf_dataset.take(2): print(x) + if use_cache: + tf_dataset = tf_dataset.cache() + if use_filter: + tf_dataset = tf_dataset.filter(lambda x: True) self.assertEqual( learner.train(tf_dataset).task(), generic_learner.Task.CLASSIFICATION ) diff --git a/yggdrasil_decision_forests/port/python/ydf/version.py b/yggdrasil_decision_forests/port/python/ydf/version.py index abe79919..568ef788 100644 --- a/yggdrasil_decision_forests/port/python/ydf/version.py +++ b/yggdrasil_decision_forests/port/python/ydf/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -version = "0.4.0" +version = "0.4.1"