From 6d47a857436257ef9a478ddc83b578ccba05c9d6 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 7 Jan 2024 17:02:32 +0100 Subject: [PATCH] Make `graphviz` dependency optional (#36647) The `graphviz` dependency has been problematic as a required Airflow dependency - especially for ARM-based installations. Graphviz packages require binary graphviz libraries - which is already a limitation, but they also require the graphviz Python bindings to be built and installed. This does not work for older Linux installations but - more importantly - when you try to install Graphviz libraries for Python 3.8, 3.9 for ARM M1 MacBooks, the packages fail to install because Python bindings compilation for M1 can only work for Python 3.10+. There is not an easy solution for that except commenting out the graphviz dependency from setup.py, when you want to install Airflow for Python 3.8, 3.9 for MacBook M1. However Graphviz is really used in two places: * when you want to render DAGs via airflow CLI - either to an image or directly to terminal (for terminals/systems supporting imgcat) * when you want to render ER diagram after you modified Airflow models The latter is a development-only feature, the former is a production feature, however it is a very niche one. This PR turns rendering of the images in Airflow into an optional feature (only working when graphviz python bindings are installed) and effectively turns graphviz into an optional extra (and removes it from requirements). This is not a breaking change technically - the CLIs to render the DAGs are still there and IF you already have graphviz installed, it will continue working as it did before. The only case where it does not work is a fresh installation where you do not have graphviz installed - there it will raise an error and inform you that you need it. 
Graphviz will remain to be installed for most users: * the Airflow Image will still contain graphviz library, because it is added there as extra * when previous version of Airflow has been installed already, then graphviz library is already installed there and Airflow will continue working as it did The only change will be a new installation of new version of Airflow from the scratch, where graphviz will need to be specified as extra or installed separately in order to enable DAG rendering option. Taking into account this behaviour (which only requires to install a graphviz package), this should not be considered as a breaking change. Extracted from: #36537 (cherry picked from commit 89f1737afb27f6e708c2e83e3d8e751d9a36f91e) --- CONTRIBUTING.rst | 14 +++++------ Dockerfile | 2 +- INSTALL | 14 +++++------ airflow/utils/dot_renderer.py | 15 +++++++++++- .../src/airflow_breeze/global_constants.py | 1 + docs/apache-airflow/extra-packages-ref.rst | 2 ++ docs/docker-stack/build-arg-ref.rst | 1 + docs/spelling_wordlist.txt | 1 + images/breeze/output_prod-image_build.txt | 2 +- newsfragments/36647.significant.rst | 23 +++++++++++++++++++ setup.cfg | 1 - setup.py | 3 +++ 12 files changed, 61 insertions(+), 18 deletions(-) create mode 100644 newsfragments/36647.significant.rst diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index abb9ed59da4fa..ba3ff9ec98f9b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -675,13 +675,13 @@ arangodb, asana, async, atlas, atlassian.jira, aws, azure, cassandra, celery, cg cncf.kubernetes, cohere, common.io, common.sql, crypto, databricks, datadog, dbt.cloud, deprecated_api, devel, devel_all, devel_ci, devel_hadoop, dingding, discord, doc, doc_gen, docker, druid, elasticsearch, exasol, facebook, ftp, gcp, gcp_api, github, github_enterprise, google, -google_auth, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, kerberos, kubernetes, -ldap, leveldb, microsoft.azure, microsoft.mssql, microsoft.psrp, microsoft.winrm, 
mongo, mssql, -mysql, neo4j, odbc, openai, openfaas, openlineage, opensearch, opsgenie, oracle, otel, pagerduty, -pandas, papermill, password, pgvector, pinecone, pinot, postgres, presto, rabbitmq, redis, s3, s3fs, -salesforce, samba, saml, segment, sendgrid, sentry, sftp, singularity, slack, smtp, snowflake, -spark, sqlite, ssh, statsd, tableau, tabular, telegram, trino, vertica, virtualenv, weaviate, -webhdfs, winrm, yandex, zendesk +google_auth, graphviz, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, kerberos, +kubernetes, ldap, leveldb, microsoft.azure, microsoft.mssql, microsoft.psrp, microsoft.winrm, mongo, +mssql, mysql, neo4j, odbc, openai, openfaas, openlineage, opensearch, opsgenie, oracle, otel, +pagerduty, pandas, papermill, password, pgvector, pinecone, pinot, postgres, presto, rabbitmq, +redis, s3, s3fs, salesforce, samba, saml, segment, sendgrid, sentry, sftp, singularity, slack, smtp, +snowflake, spark, sqlite, ssh, statsd, tableau, tabular, telegram, trino, vertica, virtualenv, +weaviate, webhdfs, winrm, yandex, zendesk .. END EXTRAS HERE Provider packages diff --git a/Dockerfile b/Dockerfile index 55ba492e5156c..e493d99ec0e58 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,7 +35,7 @@ # much smaller. 
# # Use the same builder frontend version for everyone -ARG AIRFLOW_EXTRAS="aiobotocore,amazon,async,celery,cncf.kubernetes,common.io,docker,elasticsearch,ftp,google,google_auth,grpc,hashicorp,http,ldap,microsoft.azure,mysql,odbc,openlineage,pandas,postgres,redis,sendgrid,sftp,slack,snowflake,ssh,statsd,virtualenv" +ARG AIRFLOW_EXTRAS="aiobotocore,amazon,async,celery,cncf.kubernetes,common.io,docker,elasticsearch,ftp,google,google_auth,graphviz,grpc,hashicorp,http,ldap,microsoft.azure,mysql,odbc,openlineage,pandas,postgres,redis,sendgrid,sftp,slack,snowflake,ssh,statsd,virtualenv" ARG ADDITIONAL_AIRFLOW_EXTRAS="" ARG ADDITIONAL_PYTHON_DEPS="" diff --git a/INSTALL b/INSTALL index 9630c50985238..832fd8603d7d3 100644 --- a/INSTALL +++ b/INSTALL @@ -101,13 +101,13 @@ arangodb, asana, async, atlas, atlassian.jira, aws, azure, cassandra, celery, cg cncf.kubernetes, cohere, common.io, common.sql, crypto, databricks, datadog, dbt.cloud, deprecated_api, devel, devel_all, devel_ci, devel_hadoop, dingding, discord, doc, doc_gen, docker, druid, elasticsearch, exasol, facebook, ftp, gcp, gcp_api, github, github_enterprise, google, -google_auth, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, kerberos, kubernetes, -ldap, leveldb, microsoft.azure, microsoft.mssql, microsoft.psrp, microsoft.winrm, mongo, mssql, -mysql, neo4j, odbc, openai, openfaas, openlineage, opensearch, opsgenie, oracle, otel, pagerduty, -pandas, papermill, password, pgvector, pinecone, pinot, postgres, presto, rabbitmq, redis, s3, s3fs, -salesforce, samba, saml, segment, sendgrid, sentry, sftp, singularity, slack, smtp, snowflake, -spark, sqlite, ssh, statsd, tableau, tabular, telegram, trino, vertica, virtualenv, weaviate, -webhdfs, winrm, yandex, zendesk +google_auth, graphviz, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, kerberos, +kubernetes, ldap, leveldb, microsoft.azure, microsoft.mssql, microsoft.psrp, microsoft.winrm, mongo, +mssql, mysql, neo4j, odbc, openai, 
openfaas, openlineage, opensearch, opsgenie, oracle, otel, +pagerduty, pandas, papermill, password, pgvector, pinecone, pinot, postgres, presto, rabbitmq, +redis, s3, s3fs, salesforce, samba, saml, segment, sendgrid, sentry, sftp, singularity, slack, smtp, +snowflake, spark, sqlite, ssh, statsd, tableau, tabular, telegram, trino, vertica, virtualenv, +weaviate, webhdfs, winrm, yandex, zendesk # END EXTRAS HERE # For installing Airflow in development environments - see CONTRIBUTING.rst diff --git a/airflow/utils/dot_renderer.py b/airflow/utils/dot_renderer.py index 41281fbbb1610..4d44d1e2ecf14 100644 --- a/airflow/utils/dot_renderer.py +++ b/airflow/utils/dot_renderer.py @@ -19,9 +19,14 @@ """Renderer DAG (tasks and dependencies) to the graphviz object.""" from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Any -import graphviz +try: + import graphviz +except ImportError: + warnings.warn("Could not import graphviz. Rendering graph to the graphical format will not be possible.") + graphviz = None from airflow.exceptions import AirflowException from airflow.models.baseoperator import BaseOperator @@ -151,6 +156,10 @@ def render_dag_dependencies(deps: dict[str, list[DagDependency]]) -> graphviz.Di :param deps: List of DAG dependencies :return: Graphviz object """ + if not graphviz: + raise AirflowException( + "Could not import graphviz. Install the graphviz python package to fix this error." + ) dot = graphviz.Digraph(graph_attr={"rankdir": "LR"}) for dag, dependencies in deps.items(): @@ -179,6 +188,10 @@ def render_dag(dag: DAG, tis: list[TaskInstance] | None = None) -> graphviz.Digr :param tis: List of task instances :return: Graphviz object """ + if not graphviz: + raise AirflowException( + "Could not import graphviz. Install the graphviz python package to fix this error." 
+ ) dot = graphviz.Digraph( dag.dag_id, graph_attr={ diff --git a/dev/breeze/src/airflow_breeze/global_constants.py b/dev/breeze/src/airflow_breeze/global_constants.py index 3df224b0b1079..15e6bbec94ef8 100644 --- a/dev/breeze/src/airflow_breeze/global_constants.py +++ b/dev/breeze/src/airflow_breeze/global_constants.py @@ -437,6 +437,7 @@ def get_airflow_extras(): "ftp", "google", "google_auth", + "graphviz", "grpc", "hashicorp", "http", diff --git a/docs/apache-airflow/extra-packages-ref.rst b/docs/apache-airflow/extra-packages-ref.rst index 324b5e3023097..0b6a51fd32aa4 100644 --- a/docs/apache-airflow/extra-packages-ref.rst +++ b/docs/apache-airflow/extra-packages-ref.rst @@ -52,6 +52,8 @@ python dependencies for the provided package. +---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+ | google_auth | ``pip install 'apache-airflow[google_auth]'`` | Google auth backend | +---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+ +| graphviz | ``pip install 'apache-airflow[graphviz]'`` | Graphviz renderer for converting DAG to graphical output | ++---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+ | kerberos | ``pip install 'apache-airflow[kerberos]'`` | Kerberos integration for Kerberized services (Hadoop, Presto, Trino) | +---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+ | ldap | ``pip install 'apache-airflow[ldap]'`` | LDAP authentication for users | diff --git a/docs/docker-stack/build-arg-ref.rst b/docs/docker-stack/build-arg-ref.rst index a07760558eed2..73c30a3892863 100644 --- a/docs/docker-stack/build-arg-ref.rst +++ b/docs/docker-stack/build-arg-ref.rst @@ 
-91,6 +91,7 @@ List of default extras in the production Dockerfile: * ftp * google * google_auth +* graphviz * grpc * hashicorp * http diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 56198798b412f..e7b4b509b93aa 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -685,6 +685,7 @@ googleapiclient GoogleDisplayVideo gpu gpus +graphviz greenlet Groupalia groupId diff --git a/images/breeze/output_prod-image_build.txt b/images/breeze/output_prod-image_build.txt index 9e5a2533b252b..084f184b36ae0 100644 --- a/images/breeze/output_prod-image_build.txt +++ b/images/breeze/output_prod-image_build.txt @@ -1 +1 @@ -0a11dcb596c1bf9fd08489bbba695aad +ed924a9f4691700717cb77864ba9603a diff --git a/newsfragments/36647.significant.rst b/newsfragments/36647.significant.rst new file mode 100644 index 0000000000000..dc3f0faad8a26 --- /dev/null +++ b/newsfragments/36647.significant.rst @@ -0,0 +1,23 @@ +Graphviz dependency is now an optional one, not required one. + +The ``graphviz`` dependency has been problematic as Airflow required dependency - especially for +ARM-based installations. Graphviz packages require binary graphviz libraries - which is already a +limitation, but they also require to install graphviz Python bindings to be build and installed. +This does not work for older Linux installation but - more importantly - when you try to install +Graphviz libraries for Python 3.8, 3.9 for ARM M1 MacBooks, the packages fail to install because +Python bindings compilation for M1 can only work for Python 3.10+. + +This is not a breaking change technically - the CLIs to render the DAGs is still there and IF you +already have graphviz installed, it will continue working as it did before. The only problem when it +does not work is where you do not have graphviz installed it will raise an error and inform that you need it. 
+ +Graphviz will remain to be installed for most users: + +* the Airflow Image will still contain graphviz library, because + it is added there as extra +* when previous version of Airflow has been installed already, then + graphviz library is already installed there and Airflow will + continue working as it did + +The only change will be a new installation of new version of Airflow from the scratch, where graphviz will +need to be specified as extra or installed separately in order to enable DAG rendering option. diff --git a/setup.cfg b/setup.cfg index 0a2478b618542..ab7b34511cf40 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,7 +108,6 @@ install_requires = flask-wtf>=0.15 fsspec>=2023.10.0 google-re2>=1.0 - graphviz>=0.12 gunicorn>=20.1.0 httpx importlib_metadata>=1.7;python_version<"3.9" diff --git a/setup.py b/setup.py index 9d118bcfdba03..2b01c28a6b709 100644 --- a/setup.py +++ b/setup.py @@ -318,12 +318,14 @@ def write_version(filename: str = str(AIRFLOW_SOURCES_ROOT / "airflow" / "git_ve ] doc_gen = [ "eralchemy2", + "graphviz>=0.12", ] flask_appbuilder_oauth = [ "authlib>=1.0.0", # The version here should be upgraded at the same time as flask-appbuilder in setup.cfg "flask-appbuilder[oauth]==4.3.10", ] +graphviz = ["graphviz>=0.12"] kerberos = [ "pykerberos>=1.1.13", "requests_kerberos>=0.10.0", @@ -589,6 +591,7 @@ def get_unique_dependency_list(req_list_iterable: Iterable[list[str]]): "deprecated_api": deprecated_api, "github_enterprise": flask_appbuilder_oauth, "google_auth": flask_appbuilder_oauth, + "graphviz": graphviz, "kerberos": kerberos, "ldap": ldap, "leveldb": leveldb,