From 28f2d47662a89b2e01d8f9046932201b8eba18e6 Mon Sep 17 00:00:00 2001 From: Nito Buendia Date: Fri, 30 Aug 2024 06:42:19 +0800 Subject: [PATCH] docs: modernize py dependencies docs and example (#32345) * feat: update Python multifile docs A more common approach to packaging Python package is leveraging pyproject.toml files and having a src directory (instead of a flat directory). This change intends to update the documentation and examples to match this way of packaging Python packages. * fix: fix juliaset package path * cleanup: move main file outside src * docs: address feedback #32345 Add build-system to pyproject.toml. Improve wording on documentation. Add extra step when using custom images. * fix: fix juliaset path * nit: remove extra space * lint: format setup.py * nit: reorder entries in pyproject.toml * update the description --------- Co-authored-by: tvalentyn --- .../examples/complete/juliaset/pyproject.toml | 33 ++++++++++++ .../examples/complete/juliaset/setup.py | 26 +++------ .../juliaset/{juliaset => src}/__init__.py | 0 .../juliaset/src/juliaset/__init__.py | 16 ++++++ .../juliaset/{ => src}/juliaset/juliaset.py | 0 .../{ => src}/juliaset/juliaset_test.py | 2 +- .../{ => src}/juliaset/juliaset_test_it.py | 2 +- .../sdks/python-pipeline-dependencies.md | 54 +++++++++++-------- 8 files changed, 91 insertions(+), 42 deletions(-) create mode 100644 sdks/python/apache_beam/examples/complete/juliaset/pyproject.toml rename sdks/python/apache_beam/examples/complete/juliaset/{juliaset => src}/__init__.py (100%) create mode 100644 sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/__init__.py rename sdks/python/apache_beam/examples/complete/juliaset/{ => src}/juliaset/juliaset.py (100%) rename sdks/python/apache_beam/examples/complete/juliaset/{ => src}/juliaset/juliaset_test.py (97%) rename sdks/python/apache_beam/examples/complete/juliaset/{ => src}/juliaset/juliaset_test_it.py (96%) diff --git 
a/sdks/python/apache_beam/examples/complete/juliaset/pyproject.toml b/sdks/python/apache_beam/examples/complete/juliaset/pyproject.toml new file mode 100644 index 0000000000000..6c865974cbd23 --- /dev/null +++ b/sdks/python/apache_beam/examples/complete/juliaset/pyproject.toml @@ -0,0 +1,33 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +[project] +name = "juliaset" +version = "0.0.1" +description = "Julia set workflow package." + +# Configure the required packages and scripts to install. +# Note that the Python Dataflow containers come with numpy already installed +# so this dependency will not trigger anything to be installed unless a version +# restriction is specified. +dependencies = [ + "numpy" +] + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/sdks/python/apache_beam/examples/complete/juliaset/setup.py b/sdks/python/apache_beam/examples/complete/juliaset/setup.py index c3a9fe0437654..649a5be7db75d 100644 --- a/sdks/python/apache_beam/examples/complete/juliaset/setup.py +++ b/sdks/python/apache_beam/examples/complete/juliaset/setup.py @@ -15,14 +15,16 @@ # limitations under the License. # -"""Setup.py module for the workflow's worker utilities. 
+"""setup.py module for the pipeline package. -All the workflow related code is gathered in a package that will be built as a -source distribution, staged in the staging area for the workflow being run and -then installed in the workers when they start running. +In this example, the pipeline code is gathered in a package that can be built +as source distribution and installed on the workers. The package is defined +in the pyproject.toml file. You can use setup.py file for defining +configuration that needs to be determined programmatically, for example, +custom commands to run when a package is installed. -This behavior is triggered by specifying the --setup_file command line option -when running the workflow for remote execution. +You can install this package into the workers at runtime by using +the --setup_file pipeline option. """ # pytype: skip-file @@ -107,19 +109,7 @@ def run(self): self.RunCustomCommand(command) -# Configure the required packages and scripts to install. -# Note that the Python Dataflow containers come with numpy already installed -# so this dependency will not trigger anything to be installed unless a version -# restriction is specified. -REQUIRED_PACKAGES = [ - 'numpy', -] - setuptools.setup( - name='juliaset', - version='0.0.1', - description='Julia set workflow package.', - install_requires=REQUIRED_PACKAGES, packages=setuptools.find_packages(), cmdclass={ # Command class instantiated and run during pip install scenarios. 
diff --git a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/__init__.py b/sdks/python/apache_beam/examples/complete/juliaset/src/__init__.py similarity index 100% rename from sdks/python/apache_beam/examples/complete/juliaset/juliaset/__init__.py rename to sdks/python/apache_beam/examples/complete/juliaset/src/__init__.py diff --git a/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/__init__.py b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/__init__.py new file mode 100644 index 0000000000000..cce3acad34a49 --- /dev/null +++ b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset.py b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset.py similarity index 100% rename from sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset.py rename to sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset.py diff --git a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test.py b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test.py similarity index 97% rename from sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test.py rename to sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test.py index 6416831f42693..b371c88d360d2 100644 --- a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test.py +++ b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test.py @@ -27,7 +27,7 @@ import pytest -from apache_beam.examples.complete.juliaset.juliaset import juliaset +from apache_beam.examples.complete.juliaset.src.juliaset import juliaset from apache_beam.testing.util import open_shards diff --git a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test_it.py b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test_it.py similarity index 96% rename from sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test_it.py rename to sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test_it.py index a2a3262a1fb6c..0aac9f9cffb9b 100644 --- a/sdks/python/apache_beam/examples/complete/juliaset/juliaset/juliaset_test_it.py +++ b/sdks/python/apache_beam/examples/complete/juliaset/src/juliaset/juliaset_test_it.py @@ -27,7 +27,7 @@ import pytest from hamcrest.core.core.allof import all_of -from apache_beam.examples.complete.juliaset.juliaset import juliaset +from apache_beam.examples.complete.juliaset.src.juliaset import juliaset from 
apache_beam.io.filesystems import FileSystems from apache_beam.runners.runner import PipelineState from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher diff --git a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md index 286e90a370539..2a2a515ec09b0 100644 --- a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md +++ b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md @@ -95,43 +95,53 @@ If your pipeline uses packages that are not available publicly (e.g. packages th Often, your pipeline code spans multiple files. To run your project remotely, you must group these files as a Python package and specify the package when you run your pipeline. When the remote workers start, they will install your package. To group your files as a Python package and make it available remotely, perform the following steps: -1. Create a [setup.py](https://pythonhosted.org/an_example_pypi_project/setuptools.html) file for your project. The following is a very basic `setup.py` file. +1. Create a [pyproject.toml](https://packaging.python.org/en/latest/tutorials/packaging-projects/) file for your project. The following is a very basic `pyproject.toml` file. - import setuptools + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + + [project] + name = "PACKAGE-NAME" + version = "PACKAGE-VERSION" + dependencies = [ + # List Python packages your pipeline depends on. + ] - setuptools.setup( - name='PACKAGE-NAME', - version='PACKAGE-VERSION', - install_requires=[ - # List Python packages your pipeline depends on. - ], - packages=setuptools.find_packages(), - ) +2. If your package requires some programmatic configuration, or you need to use the `--setup_file` pipeline option, create a setup.py file for your project. -2. 
Structure your project so that the root directory contains the `setup.py` file, the main workflow file, and a directory with the rest of the files, for example: + # Note that the package can be completely defined by pyproject.toml. + # This file is optional. + import setuptools + setuptools.setup() + +3. Structure your project so that the root directory contains the `pyproject.toml`, the `setup.py` file, and a `src/` directory with the rest of the files. For example: root_dir/ + pyproject.toml setup.py - main.py - my_package/ - my_pipeline_launcher.py - my_custom_dofns_and_transforms.py - other_utils_and_helpers.py + src/ + main.py + my_package/ + my_pipeline_launcher.py + my_custom_dofns_and_transforms.py + other_utils_and_helpers.py See [Juliaset](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/complete/juliaset) for an example that follows this project structure. -3. Install your package in the submission environment, for example by using the following command: +4. Install your package in the submission environment, for example by using the following command: pip install -e . -4. Run your pipeline with the following command-line option: +5. If you use a [custom container](#custom-containers), copy and install the package in the container as well. + +6. Run your pipeline with the following command-line option: --setup_file /path/to/setup.py -**Note:** It is not necessary to supply the `--requirements_file` [option](#pypi-dependencies) if the dependencies of your package are defined in the `install_requires` field of the `setup.py` file (see step 1). -However unlike with the `--requirements_file` option, when you use the `--setup_file` option, Beam doesn't stage the dependent packages to the runner. -Only the pipeline package is staged. If they aren't already provided in the runtime environment, -the package dependencies are installed from PyPI at runtime. 
+**Note:** It is not necessary to supply the `--requirements_file` [option](#pypi-dependencies) if the dependencies of your package are defined in the +`dependencies` field of the `pyproject.toml` file (see step 1). However, unlike with the `--requirements_file` option, when you use the `--setup_file` option, Beam doesn't stage the dependent packages to the runner. +Only the pipeline package is staged. If the package dependencies aren't already provided in the runtime environment, they are installed from PyPI at runtime. ## Non-Python Dependencies or PyPI Dependencies with Non-Python Dependencies {#nonpython}