diff --git a/yggdrasil_decision_forests/port/python/.bazelrc b/yggdrasil_decision_forests/port/python/.bazelrc index cf64a4fc..9872150f 100644 --- a/yggdrasil_decision_forests/port/python/.bazelrc +++ b/yggdrasil_decision_forests/port/python/.bazelrc @@ -1,8 +1,5 @@ # Bazel configuration for Yggdrasil Decision Forests -# Common flags. -common --experimental_repo_remote_exec - # On Windows, uncomment the next line to solve long path issues: # startup --output_user_root=C:/tmpbld diff --git a/yggdrasil_decision_forests/port/python/.bazelversion b/yggdrasil_decision_forests/port/python/.bazelversion index e230c839..4be2c727 100644 --- a/yggdrasil_decision_forests/port/python/.bazelversion +++ b/yggdrasil_decision_forests/port/python/.bazelversion @@ -1 +1 @@ -5.3.0 \ No newline at end of file +6.5.0 \ No newline at end of file diff --git a/yggdrasil_decision_forests/port/python/CHANGELOG.md b/yggdrasil_decision_forests/port/python/CHANGELOG.md index ca830a06..89d10bd0 100644 --- a/yggdrasil_decision_forests/port/python/CHANGELOG.md +++ b/yggdrasil_decision_forests/port/python/CHANGELOG.md @@ -1,17 +1,39 @@ # Changelog -## HEAD +## 0.3.0 - 2024-03-15 -## Breaking +### Breaking - Custom losses now require to provide the gradient, instead of the negative of the gradient. -- Clarified that YDF may modify numpy arrays containing the custom loss. +- Clarified that YDF may modify numpy arrays returned by a custom loss + function. ### Features +- Allow using Jax for custom loss definitions. - Allow setting `may_trigger_gc` on custom losses. - Add support for MHLD oblique decision trees. +- Expose hyperparameter `sparse_oblique_max_num_projections`. +- HTML plots for trees with `model.plot_tree()`. +- Fix protobuf version to 4.24.3 to fix some incompatibilities when using + conda. +- Allow to list compatible engines with `model.list_compatible_engines()`. +- Allow to choose a fast engine with `model.force_engine(...)`. 
+ +### Fix + +- Fix slow engine creation for some combination of oblique splits. +- Improve error message when feeding multi-dimensional labels. + +### Documentation + +- Clarified documentation of hyperparameters for oblique splits. +- Fix plots, typos. + +#### Release music + +Doctor Gradus ad Parnassum from "Children's Corner" (L. 113). Claude Debussy ## 0.2.0 - 2024-02-22 diff --git a/yggdrasil_decision_forests/port/python/INSTALLATION.md b/yggdrasil_decision_forests/port/python/INSTALLATION.md new file mode 100644 index 00000000..84dae257 --- /dev/null +++ b/yggdrasil_decision_forests/port/python/INSTALLATION.md @@ -0,0 +1,96 @@ +# Building and installing YDF + +## Install from PyPI + +To install YDF, run: + +``` +pip install ydf --upgrade +``` + +## Building + +### Pre-work + +Use `tools/update_version.sh` to update the version number (if needed) and +remember to update `CHANGELOG.md`. + +### Linux + +#### Docker + +For building manylinux2014-compatible packages, you can use an appropriate +Docker image. The pre-configured build script at +`tools/build_linux_release_in_docker.sh` starts a container and builds the +wheels end-to-end. You can find the wheels in the `dist/` subdirectory. + +#### Manual build + +Note that we may not be able to help with issues during manual builds. + +**Requirements** + +* Bazel - version as specified in `.bazelversion`, + [Bazelisk](https://github.com/bazelbuild/bazelisk) recommended +* GCC >= 9 or Clang >= 14 +* rsync +* Python headers (e.g. `python-dev` package on Ubuntu) +* Python virtualenv + +**Steps** + +1. Compile and test the code with + + ```shell + # Create a virtual environment where Python dependencies will be installed. + python -m venv myvenv && source myvenv/bin/activate + RUN_TESTS=1 ./tools/test_pydf.sh + deactivate + ``` + + Set the `CC` environment variable to select the compiler, e.g. `CC=clang-14`. + +1. 
Build the Pip package + + ```shell + PYTHON_BIN=python + ./tools/build_pydf.sh $PYTHON_BIN + ``` + + If you want to build with [Pyenv](https://github.com/pyenv/pyenv) for all supported Python versions, run + + ```shell + ./tools/build_pydf.sh ALL_VERSIONS + ``` + +### MacOS + +**Requirements** + +* Bazel (version as specified in `.bazelversion`, + [Bazelisk](https://github.com/bazelbuild/bazelisk) recommended) +* XCode command line tools +* [Pyenv](https://github.com/pyenv/pyenv) + +**Building for all supported Python versions** + +Simply run + +```shell +./tools/build_macos_release.sh +``` +This will build a MacOS wheel for every supported Python version on the current +architecture. See the contents of this script for details about the build. + +### MacOS cross-compilation + +We have not tested MacOS cross-compilation (Intel <-> ARM) for YDF yet, though +it is on our roadmap. + +### AArch64 + +We have not tested AArch64 compilation for YDF yet. + +### Windows + +TODO, see `tools/build.bat`. diff --git a/yggdrasil_decision_forests/port/python/README.md b/yggdrasil_decision_forests/port/python/README.md index 1c1cedcc..5ca4ce7c 100644 --- a/yggdrasil_decision_forests/port/python/README.md +++ b/yggdrasil_decision_forests/port/python/README.md @@ -18,6 +18,8 @@ To install YDF, in Python, simply grab the package from pip: pip install ydf ``` +For build instructions, see INSTALLATION.md. + ## Usage Example ```python @@ -37,19 +39,6 @@ model.save("my_model") loaded_model = ydf.load_model("my_model") ``` -## Compiling & Building - -To build the Python port of YDF, install Bazel, GCC 9 and run the following -command from the root of the port/python directory in the YDF repository - -```sh -PYTHON_BIN=python3.9 -./tools/test_pydf.sh -./tools/build_pydf.sh $PYTHON_BIN -``` - -Browse the `tools/` directory for more build helpers. 
- ## Frequently Asked Questions * **Is it PYDF or YDF?** The name of the library is simply ydf, and so is the diff --git a/yggdrasil_decision_forests/port/python/config/setup.py b/yggdrasil_decision_forests/port/python/config/setup.py index 56500bfe..f7e3b343 100644 --- a/yggdrasil_decision_forests/port/python/config/setup.py +++ b/yggdrasil_decision_forests/port/python/config/setup.py @@ -21,7 +21,7 @@ from setuptools.command.install import install from setuptools.dist import Distribution -_VERSION = "0.2.0" +_VERSION = "0.3.0" with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -34,6 +34,8 @@ OPTIONAL_PACKAGES = {"pandas": ["pandas"]} +MAC_CROSS_COMPILED = False # Change if cross-compiled + class InstallPlatlib(install): @@ -63,12 +65,13 @@ def finalize_options(self): def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) - if platform.system() == "Darwin": - # Uncomment on of the lines below to adapt the platform string when - # cross-compiling. - # plat = "macosx_12_0_arm64" - # plat = "macosx_10_15_x86_64" - pass + if platform.system() == "Darwin" and MAC_CROSS_COMPILED: + if platform.processor() == "arm": + plat = "macosx_10_15_x86_64" + elif platform.processor() == "i386": + plat = "macosx_12_0_arm64" + else: + raise ValueError(f"Unknown processor {platform.processor()}") return python, abi, plat except ImportError: diff --git a/yggdrasil_decision_forests/port/python/examples/minimal.py b/yggdrasil_decision_forests/port/python/examples/minimal.py new file mode 100644 index 00000000..b42fa38d --- /dev/null +++ b/yggdrasil_decision_forests/port/python/examples/minimal.py @@ -0,0 +1,66 @@ +# Copyright 2022 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Minimal usage example of YDF. + +This example trains, displays, evaluates and exports a Gradient Boosted Tree +model. + +Usage example: + + pip install ydf pandas -U + python minimal.py +""" + +from absl import app +import pandas as pd +import ydf + + +def main(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + # Download the Adult train and test splits, load them in Pandas dataframes. + train_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/adult_train.csv" + test_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/adult_test.csv" + train_df = pd.read_csv(train_path) + test_df = pd.read_csv(test_path) + + # Display full logs + ydf.verbose(2) + + # Trains the model. + model = ydf.GradientBoostedTreesLearner(label="income").train(train_df) + + # Some information about the model. + print(model.describe()) + + # Evaluates the model on the test dataset. + evaluation = model.evaluate(test_df) + print(evaluation) + + # Exports the model to disk. + model.save("/tmp/ydf_model") + + # Reload the model from disk + loaded_model = ydf.load_model("/tmp/ydf_model") + + # Make predictions with the model from disk. 
+ predictions = loaded_model.predict(test_df) + print(predictions) + + +if __name__ == "__main__": + app.run(main) diff --git a/yggdrasil_decision_forests/port/python/tools/build_linux_release.sh b/yggdrasil_decision_forests/port/python/tools/build_linux_release.sh index 5f013689..93bebfd4 100755 --- a/yggdrasil_decision_forests/port/python/tools/build_linux_release.sh +++ b/yggdrasil_decision_forests/port/python/tools/build_linux_release.sh @@ -24,7 +24,8 @@ function build_py() { $PYTHON -m venv /tmp/venv_$PYTHON source /tmp/venv_$PYTHON/bin/activate bazel clean --expunge - COMPILERS="gcc" ./tools/test_pydf.sh + export CC="gcc" + ./tools/test_pydf.sh ./tools/build_pydf.sh python } diff --git a/yggdrasil_decision_forests/port/python/tools/build_linux_release_in_docker.sh b/yggdrasil_decision_forests/port/python/tools/build_linux_release_in_docker.sh index 9bbdd4a3..53be237d 100755 --- a/yggdrasil_decision_forests/port/python/tools/build_linux_release_in_docker.sh +++ b/yggdrasil_decision_forests/port/python/tools/build_linux_release_in_docker.sh @@ -14,17 +14,18 @@ # limitations under the License. -DOCKER=gcr.io/tfx-oss-public/manylinux2014-bazel:bazel-5.3.0 +DOCKER=quay.io/pypa/manylinux2014_x86_64@sha256:2e37241d9c9fbbccea009e59505a1384f9501a7bfea77b21fdcbf332c7036e70 -# Current directory -# Useful if Yggdrasil Decision Forests is available locally in a neighbor -# directory. +BAZELISK_VERSION="v1.19.0" YDF_PATH=$(realpath $PWD/../../..) 
-YDF_DIRNAME=${YDF_PATH##*/} # Download docker -sudo docker pull ${DOCKER} +docker pull "$DOCKER" -# Start docker -sudo docker run -it -v ${PWD}/../../../../:/working_dir -w /working_dir/${YDF_DIRNAME}/yggdrasil_decision_forests/port/python ${DOCKER} \ - /bin/bash -c "./tools/build_linux_release.sh" +# Start the container: install rsync and Bazelisk (as "bazel"), then build. +docker run -it -v "$YDF_PATH":/working_dir -w /working_dir/yggdrasil_decision_forests/port/python \ + "$DOCKER" /bin/bash -c " \ + yum update -y && yum install -y rsync && \ + curl -fL -o /usr/local/bin/bazel https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-linux-amd64 && \ + chmod +x /usr/local/bin/bazel && \ + ./tools/build_linux_release.sh " diff --git a/yggdrasil_decision_forests/port/python/tools/build_macos_release.sh b/yggdrasil_decision_forests/port/python/tools/build_macos_release.sh new file mode 100755 index 00000000..24101252 --- /dev/null +++ b/yggdrasil_decision_forests/port/python/tools/build_macos_release.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2022 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +set -vex + +declare -a python_versions=("3.8" "3.9" "3.10" "3.11") + +for pyver in "${python_versions[@]}" +do + pyenv install -s "$pyver" + export PYENV_VERSION=$pyver + rm -rf "${TMPDIR:?}venv" # Abort instead of deleting ./venv if TMPDIR is unset. + python -m venv "${TMPDIR}venv" + source "${TMPDIR}venv/bin/activate" + pip install --upgrade pip + + echo "Building with $(python3 -V 2>&1)" + + bazel clean --expunge + RUN_TESTS=0 CC="clang" ./tools/test_pydf.sh + ./tools/build_pydf.sh python + deactivate +done \ No newline at end of file diff --git a/yggdrasil_decision_forests/port/python/tools/build_pydf.sh b/yggdrasil_decision_forests/port/python/tools/build_pydf.sh index ed6e75ab..079399fd 100755 --- a/yggdrasil_decision_forests/port/python/tools/build_pydf.sh +++ b/yggdrasil_decision_forests/port/python/tools/build_pydf.sh @@ -15,7 +15,7 @@ -# Create the Yggdrasil Decision Forests pip package. +# Create the YDF pip package. # This command uses the compiled artifacts generated by tools/test_pydf.sh. # It should be run from the PYDF workspace root. # @@ -27,19 +27,12 @@ # # Make sure the package are compatible with manylinux2014. # ./tools/build_pip_package.sh ALL_VERSIONS # -# TODO: Add compilation on MacOS and Windows -# # Requirements: # # pyenv (if using ALL_VERSIONS_ALREADY_ASSEMBLED or ALL_VERSIONS) # See https://github.com/pyenv/pyenv-installer # Will be installed by this script if INSTALL_PYENV is set to INSTALL_PYENV. # -# Auditwheel -# Auditwheel is required for Linux builds. -# Auditwheel needs to be version 5.2.0. The script will attempt to -# update Auditwheel to this version. -# set -xve @@ -51,16 +44,6 @@ function is_macos() { # Temporary directory used to assemble the package. 
SRCPK="$(pwd)/tmp_package" -function check_auditwheel() { - PYTHON="$1" - shift - local auditwheel_version="$(${PYTHON} -m pip show auditwheel | grep "Version:")" - if [ "$auditwheel_version" != "Version: 5.2.0" ]; then - echo "Auditwheel needs to be Version 5.2.0, currently ${auditwheel_version}" - exit 1 - fi -} - # Pypi package version compatible with a given version of python. # Example: Python3.8.2 => Package version: "38" function python_to_package_version() { @@ -78,7 +61,7 @@ function install_dependencies() { ${PYTHON} -m pip install setuptools -U ${PYTHON} -m pip install build -U ${PYTHON} -m pip install virtualenv -U - ${PYTHON} -m pip install auditwheel==5.2.0 + ${PYTHON} -m pip install auditwheel==6.0.0 --force-reinstall } function check_is_build() { @@ -98,9 +81,9 @@ function assemble_files() { mkdir -p ${SRCPK} cp -R ydf config/setup.py config/MANIFEST.in README.md CHANGELOG.md ${SRCPK} - # When cross-compiling, adapt the platform string. - if [ ${ARG} == "ALL_VERSIONS_MAC_INTEL_CROSSCOMPILE" ]; then - sed -i'.bak' -e "s/# plat = \"macosx_10_15_x86_64\"/plat = \"macosx_10_15_x86_64\"/" ${SRCPK}/setup.py + # When cross-compiling, adapt setup.py + if [ ${ARG} == "ALL_VERSIONS_MAC_CROSSCOMPILE" ]; then + sed -i'.bak' -e "s/MAC_CROSS_COMPILED = False/MAC_CROSS_COMPILED = True/" ${SRCPK}/setup.py fi # YDF's wrappers and .so. @@ -137,7 +120,7 @@ function build_package() { # Tests a pip package. function test_package() { - if [ ${ARG} == "ALL_VERSIONS_MAC_INTEL_CROSSCOMPILE" ]; then + if [ ${ARG} == "ALL_VERSIONS_MAC_CROSSCOMPILE" ]; then echo "Cross-compiled packages cannot be tested on the machine they're built with." return fi @@ -160,29 +143,34 @@ function test_package() { ${PIP} show ydf -f # Run a small example (in different folder to avoid clashes) - # TODO: Implement a small test. + local current_folder=$(basename "$PWD") pushd .. 
- ${PYTHON} -c "import ydf" + ${PIP} install pandas + ${PYTHON} $current_folder/examples/minimal.py popd - # rm -rf previous_package - # mkdir previous_package - # ${PYTHON} -m pip download --no-deps -d previous_package ydf - # local old_file_size=`du -k "previous_package" | cut -f1` - # local new_file_size=`du -k $PACKAGEPATH | cut -f1` - # local scaled_old_file_size=$(($old_file_size * 12)) - # local scaled_new_file_size=$(($new_file_size * 10)) - # if [ "$scaled_new_file_size" -gt "$scaled_old_file_size" ]; then - # echo "New package is 20% larger than the previous one." - # echo "This probably indicates a problem, aborting." - # exit 1 - # fi - # scaled_old_file_size=$(($old_file_size * 8)) - # if [ "$scaled_new_file_size" -lt "$scaled_old_file_size" ]; then - # echo "New package is 20% smaller than the previous one." - # echo "This probably indicates a problem, aborting." - # exit 1 - # fi + if [ -d previous_package ]; then + rm -r previous_package + fi + mkdir previous_package + ${PYTHON} -m pip download --no-deps -d previous_package ydf + local old_file_size=`du -k "previous_package" | cut -f1` + local new_file_size=`du -k $PACKAGEPATH | cut -f1` + local scaled_old_file_size=$(($old_file_size * 12)) + local scaled_new_file_size=$(($new_file_size * 10)) + if [ "$scaled_new_file_size" -gt "$scaled_old_file_size" ]; then + echo "New package is 20% larger than the previous one." + echo "This may indicates an issue with the wheel, aborting." + exit 1 + fi + scaled_old_file_size=$(($old_file_size * 8)) + if [ "$scaled_new_file_size" -lt "$scaled_old_file_size" ]; then + echo "New package is 20% smaller than the previous one." + echo "This may indicates an issue with the wheel, aborting." 
+ exit 1 + fi + rm -r previous_package + echo "Testing $PACKAGEPATH successful" } # Builds and tests a pip package in a given version of python @@ -198,7 +186,6 @@ function e2e_native() { if is_macos; then PACKAGEPATH="dist/ydf-*-cp${PACKAGE}-cp${PACKAGE}*-*.whl" else - check_auditwheel ${PYTHON} PACKAGEPATH="dist/ydf-*-cp${PACKAGE}-cp${PACKAGE}*-linux_x86_64.whl" ${PYTHON} -m auditwheel repair --plat manylinux2014_x86_64 -w dist ${PACKAGEPATH} fi @@ -267,6 +254,7 @@ if [ -z "${ARG}" ]; then echo "The first argument should be one of:" echo " ALL_VERSIONS: Build all pip packages using pyenv." echo " ALL_VERSIONS_ALREADY_ASSEMBLED: Build all pip packages from already assembled files using pyenv." + echo " ALL_VERSIONS_MAC_CROSSCOMPILE: Build all pip packages from already assembled files using pyenv and cross-compile between MacOS ARM64 / Intel builds." echo " Python binary (e.g. python3.9): Build a pip package for a specific python version without pyenv." exit 1 elif [ ${ARG} == "ALL_VERSIONS" ]; then @@ -281,7 +269,7 @@ elif [ ${ARG} == "ALL_VERSIONS_ALREADY_ASSEMBLED" ]; then e2e_pyenv 3.9.12 e2e_pyenv 3.10.4 e2e_pyenv 3.11.0 -elif [ ${ARG} == "ALL_VERSIONS_MAC_INTEL_CROSSCOMPILE" ]; then +elif [ ${ARG} == "ALL_VERSIONS_MAC_CROSSCOMPILE" ]; then eval "$(pyenv init -)" assemble_files e2e_pyenv 3.9.12 @@ -293,3 +281,4 @@ else PYTHON=${ARG} e2e_native ${PYTHON} fi + diff --git a/yggdrasil_decision_forests/port/python/tools/test_pydf.sh b/yggdrasil_decision_forests/port/python/tools/test_pydf.sh index 61c55144..2c62332b 100755 --- a/yggdrasil_decision_forests/port/python/tools/test_pydf.sh +++ b/yggdrasil_decision_forests/port/python/tools/test_pydf.sh @@ -20,26 +20,22 @@ # # Options: # RUN_TESTS: Run the unit tests, 0 or 1 (default). -# COMPILERS: Compilers to build, separated by semicolon. Defaults to gcc-9 # # Usage example: # -# # Compilation with GCC 9, C++17. Running tests. 
-# ./tools/test_pydf.sh -# # # Compilation with Clang 14, without tests -# COMPILERS="clang-14" RUN_TESTS=0 ./tools/test_pydf.sh +# CC="clang-14" RUN_TESTS=0 ./tools/test_pydf.sh # set -xev build_and_maybe_test () { echo "Building PYDF the following settings:" - echo " Compiler : $1" + echo " Compiler : $CC" BAZEL=bazel ${BAZEL} version - local flags="--config=linux_cpp17 --config=linux_avx2 --features=-fully_static_link --repo_env=CC=${1}" + local flags="--config=linux_cpp17 --config=linux_avx2 --features=-fully_static_link" local pydf_targets="//ydf/...:all" # Install PYDF components python -m pip install -r requirements.txt @@ -54,14 +50,9 @@ build_and_maybe_test () { main () { # Set default values - : "${COMPILERS:="gcc-9"}" : "${RUN_TESTS:=1}" - local compilers_array=(${COMPILERS//;/ }) - -for compiler in ${compilers_array[@]}; do - build_and_maybe_test "$compiler" -done + build_and_maybe_test } main \ No newline at end of file diff --git a/yggdrasil_decision_forests/port/python/tools/update_version.sh b/yggdrasil_decision_forests/port/python/tools/update_version.sh index 5ef3289a..11665ef2 100755 --- a/yggdrasil_decision_forests/port/python/tools/update_version.sh +++ b/yggdrasil_decision_forests/port/python/tools/update_version.sh @@ -9,10 +9,11 @@ confirmation () { } # Warning message. 
-echo "You are about to release a new version of the YDF Python API" +echo "You are about to prepare the release of a new version of the YDF Python API" confirmation -SRC="third_party/yggdrasil_decision_forests/port/python" +# Root of the Python port; run this script from the port/python directory. +SRC="." # Get version CURRENT_VERSION=$(cat ${SRC}/ydf/version.py | grep -o "[\.0-9]\+") diff --git a/yggdrasil_decision_forests/port/python/ydf/cc/BUILD b/yggdrasil_decision_forests/port/python/ydf/cc/BUILD index d182b371..40ea0854 100644 --- a/yggdrasil_decision_forests/port/python/ydf/cc/BUILD +++ b/yggdrasil_decision_forests/port/python/ydf/cc/BUILD @@ -21,7 +21,6 @@ pybind_extension( "@com_google_pybind11_protobuf//pybind11_protobuf:native_proto_caster", ], ) - # Libraries # ========= diff --git a/yggdrasil_decision_forests/port/python/ydf/version.py b/yggdrasil_decision_forests/port/python/ydf/version.py index b3d138ae..24123add 100644 --- a/yggdrasil_decision_forests/port/python/ydf/version.py +++ b/yggdrasil_decision_forests/port/python/ydf/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -version = "0.2.0" +version = "0.3.0"