From f8d8552fdda02e17af535c71b0240896651d16c8 Mon Sep 17 00:00:00 2001 From: Christopher Davis Date: Tue, 14 Feb 2023 16:06:00 -0800 Subject: [PATCH 1/9] Allow Polars DataFrames --- altair/vegalite/v5/api.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index d457c006f..36b5d6c16 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -8,6 +8,7 @@ import pandas as pd from toolz.curried import pipe as _pipe import itertools +from importlib.util import find_spec from .schema import core, channels, mixins, Undefined, SCHEMA_URL @@ -108,6 +109,11 @@ def _prepare_data(data, context=None): if context is not None and data_transformers.consolidate_datasets: data = _consolidate_data(data, context) + if find_spec('polars'): + import polars as pl + if isinstance(data, pl.DataFrame): + data = core.Data({"values": data.write_json(row_oriented=True)}) + # if data is still not a recognized type, then return if not isinstance(data, (dict, core.Data)): warnings.warn("data of type {} not recognized".format(type(data))) From 492c5926baef365db33b8f6a2aafa07f262254ab Mon Sep 17 00:00:00 2001 From: Christopher Davis Date: Tue, 14 Feb 2023 16:20:29 -0800 Subject: [PATCH 2/9] black formatting --- altair/vegalite/v5/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 36b5d6c16..989a0f222 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -109,8 +109,9 @@ def _prepare_data(data, context=None): if context is not None and data_transformers.consolidate_datasets: data = _consolidate_data(data, context) - if find_spec('polars'): + if find_spec("polars"): import polars as pl + if isinstance(data, pl.DataFrame): data = core.Data({"values": data.write_json(row_oriented=True)}) From 1fbb7c104fe31b9d5e8cc1cf57a4178695ceefe5 Mon Sep 17 00:00:00 2001 From: mattijn Date: Thu, 16 Feb 2023 00:13:05 +0100 Subject: [PATCH 
3/9] code review suggestions --- altair/utils/data.py | 12 ++++++++++-- altair/vegalite/v5/api.py | 15 ++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index 403b69e82..9669a8f2b 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -76,6 +76,9 @@ def limit_rows(data, max_rows=5000): values = data["values"] else: return data + elif hasattr(data, "__dataframe__"): + if "polars" in type(data).__module__: + values = data if max_rows is not None and len(values) > max_rows: raise MaxRowsError( "The number of rows in your dataset is greater " @@ -152,12 +155,17 @@ def to_values(data): if "values" not in data: raise KeyError("values expected in data dict, but not present.") return data + elif hasattr(data, "__dataframe__"): + # only support for polars dataframe + if "polars" in type(data).__module__: + # currently no sanization on the data + return {"values": data.to_dicts()} def check_data_type(data): """Raise if the data is not a dict or DataFrame.""" - if not isinstance(data, (dict, pd.DataFrame)) and not hasattr( - data, "__geo_interface__" + if not isinstance(data, (dict, pd.DataFrame)) and not any( + hasattr(data, attr) for attr in ["__geo_interface__", "__dataframe__"] ): raise TypeError( "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format( diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 989a0f222..2f36c267d 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -8,7 +8,6 @@ import pandas as pd from toolz.curried import pipe as _pipe import itertools -from importlib.util import find_spec from .schema import core, channels, mixins, Undefined, SCHEMA_URL @@ -98,23 +97,21 @@ def _prepare_data(data, context=None): return data # convert dataframes or objects with __geo_interface__ to dict - if isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"): + elif isinstance(data, pd.DataFrame) or hasattr(data, 
"__geo_interface__"): data = _pipe(data, data_transformers.get()) # convert string input to a URLData - if isinstance(data, str): + elif isinstance(data, str): data = core.UrlData(data) + elif hasattr(data, "__dataframe__"): + if "polars" in type(data).__module__: + data = _pipe(data, data_transformers.get()) + # consolidate inline data to top-level datasets if context is not None and data_transformers.consolidate_datasets: data = _consolidate_data(data, context) - if find_spec("polars"): - import polars as pl - - if isinstance(data, pl.DataFrame): - data = core.Data({"values": data.write_json(row_oriented=True)}) - # if data is still not a recognized type, then return if not isinstance(data, (dict, core.Data)): warnings.warn("data of type {} not recognized".format(type(data))) From 45bbbb7398e68e6c696d3af6cbfcb16addb6c803 Mon Sep 17 00:00:00 2001 From: mattijn Date: Thu, 16 Feb 2023 08:23:03 +0100 Subject: [PATCH 4/9] improve code comment --- altair/utils/data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index 9669a8f2b..665738b10 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -156,9 +156,12 @@ def to_values(data): raise KeyError("values expected in data dict, but not present.") return data elif hasattr(data, "__dataframe__"): - # only support for polars dataframe + # here we like to provide agnostic dataframe support + # we start with experimental support for polars dataframe if "polars" in type(data).__module__: - # currently no sanization on the data + # currently the polars function .to_dicts() is used to retrieve the data + # but here we would like to explore options using the .__dataframe__() + # currently no sanitization on the data return {"values": data.to_dicts()} From e02b646f7e9eebf9fbd312221073426b0c136c09 Mon Sep 17 00:00:00 2001 From: mattijn Date: Thu, 16 Feb 2023 21:32:05 +0100 Subject: [PATCH 5/9] support dataframe interchange format --- 
altair/utils/data.py | 50 ++++++++++++++++++++++++++++++++------- altair/vegalite/v5/api.py | 3 +-- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index 665738b10..f915f458b 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -77,8 +77,7 @@ def limit_rows(data, max_rows=5000): else: return data elif hasattr(data, "__dataframe__"): - if "polars" in type(data).__module__: - values = data + values = data if max_rows is not None and len(values) > max_rows: raise MaxRowsError( "The number of rows in your dataset is greater " @@ -101,6 +100,13 @@ def sample(data, n=None, frac=None): n = n if n else int(frac * len(values)) values = random.sample(values, n) return {"values": values} + elif hasattr(data, "__dataframe__"): + # experimental interchange dataframe support + pi = import_pyarrow_interchange() + pa_table = pi.from_dataframe(data) + n = n if n else int(frac * len(pa_table)) + indices = random.sample(range(len(pa_table)), n) + return pa_table.take(indices) @curried.curry @@ -156,13 +162,10 @@ def to_values(data): raise KeyError("values expected in data dict, but not present.") return data elif hasattr(data, "__dataframe__"): - # here we like to provide agnostic dataframe support - # we start with experimental support for polars dataframe - if "polars" in type(data).__module__: - # currently the polars function .to_dicts() is used to retrieve the data - # but here we would like to explore options using the .__dataframe__() - # currently no sanitization on the data - return {"values": data.to_dicts()} + # experimental interchange dataframe support + pi = import_pyarrow_interchange() + pa_table = pi.from_dataframe(data) + return {"values": pa_table.to_pylist()} def check_data_type(data): @@ -201,6 +204,11 @@ def _data_to_json_string(data): if "values" not in data: raise KeyError("values expected in data dict, but not present.") return json.dumps(data["values"], sort_keys=True) + elif hasattr(data, 
"__dataframe__"): + # experimental interchange dataframe support + pi = import_pyarrow_interchange() + pa_table = pi.from_dataframe(data) + return json.dumps(pa_table.to_pylist()) else: raise NotImplementedError( "to_json only works with data expressed as " "a DataFrame or as a dict" @@ -222,6 +230,15 @@ def _data_to_csv_string(data): if "values" not in data: raise KeyError("values expected in data dict, but not present") return pd.DataFrame.from_dict(data["values"]).to_csv(index=False) + elif hasattr(data, "__dataframe__"): + # experimental interchange dataframe support + pi = import_pyarrow_interchange() + import pyarrow as pa + import pyarrow.csv as pa_csv + pa_table = pi.from_dataframe(data) + csv_buffer = pa.BufferOutputStream() + pa_csv.write_csv(pa_table, csv_buffer) + return csv_buffer.getvalue().to_pybytes().decode() else: raise NotImplementedError( "to_csv only works with data expressed as " "a DataFrame or as a dict" @@ -253,3 +270,18 @@ def curry(*args, **kwargs): AltairDeprecationWarning, ) return curried.curry(*args, **kwargs) + +def import_pyarrow_interchange(): + import pkg_resources + try: + pkg_resources.require("pyarrow>=11.0.0") + # The package is installed and meets the minimum version requirement + import pyarrow.interchange as pi + return pi + except pkg_resources.DistributionNotFound: + # The package is not installed + raise ImportError("The package 'pyarrow' is required, but not installed") + except pkg_resources.VersionConflict: + # The package is installed but does not meet the minimum version requirement + raise ImportError("The installed version of 'pyarrow' does not meet " + "the minimum requirement of version 11.0.0.") \ No newline at end of file diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 2f36c267d..478c3e5d0 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -105,8 +105,7 @@ def _prepare_data(data, context=None): data = core.UrlData(data) elif hasattr(data, "__dataframe__"): - if 
"polars" in type(data).__module__: - data = _pipe(data, data_transformers.get()) + data = _pipe(data, data_transformers.get()) # consolidate inline data to top-level datasets if context is not None and data_transformers.consolidate_datasets: From 8db81cd465744e2d6c19bc757a0e3f3c53c4698e Mon Sep 17 00:00:00 2001 From: mattijn Date: Thu, 16 Feb 2023 21:39:10 +0100 Subject: [PATCH 6/9] linting --- altair/utils/data.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index f915f458b..8a46fb357 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -235,6 +235,7 @@ def _data_to_csv_string(data): pi = import_pyarrow_interchange() import pyarrow as pa import pyarrow.csv as pa_csv + pa_table = pi.from_dataframe(data) csv_buffer = pa.BufferOutputStream() pa_csv.write_csv(pa_table, csv_buffer) @@ -271,17 +272,22 @@ def curry(*args, **kwargs): ) return curried.curry(*args, **kwargs) + def import_pyarrow_interchange(): import pkg_resources + try: pkg_resources.require("pyarrow>=11.0.0") # The package is installed and meets the minimum version requirement import pyarrow.interchange as pi + return pi except pkg_resources.DistributionNotFound: # The package is not installed raise ImportError("The package 'pyarrow' is required, but not installed") except pkg_resources.VersionConflict: # The package is installed but does not meet the minimum version requirement - raise ImportError("The installed version of 'pyarrow' does not meet " - "the minimum requirement of version 11.0.0.") \ No newline at end of file + raise ImportError( + "The installed version of 'pyarrow' does not meet " + "the minimum requirement of version 11.0.0." 
+ ) From 75e931f9685d7add6cd67424a9119c237e5c4b0e Mon Sep 17 00:00:00 2001 From: mattijn Date: Sat, 18 Feb 2023 11:24:10 +0100 Subject: [PATCH 7/9] include info in release notes and data section --- doc/releases/changes.rst | 1 + doc/user_guide/data.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst index 02978562e..2ba04d4c2 100644 --- a/doc/releases/changes.rst +++ b/doc/releases/changes.rst @@ -22,6 +22,7 @@ Enhancements - The documentation page has been revamped, both in terms of appearance and content. - More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846) - Improved error messages (#2842) +- Include experimental support for the DataFrame Interchange Protocol (through `__dataframe__`. This is dependent on `pyarrow>=11.0.0` (#2888) Grammar Changes ~~~~~~~~~~~~~~~ diff --git a/doc/user_guide/data.rst b/doc/user_guide/data.rst index 8ac86cdc2..d89cd8d21 100644 --- a/doc/user_guide/data.rst +++ b/doc/user_guide/data.rst @@ -21,6 +21,7 @@ there are many different ways of specifying a dataset: - as a url string pointing to a ``json`` or ``csv`` formatted text file - as a `geopandas GeoDataFrame `_, `Shapely Geometries `_, `GeoJSON Objects `_ or other objects that support the ``__geo_interface__`` - as a generated dataset such as numerical sequences or geographic reference elements +- as a DataFrame that supports the DataFrame Interchange Protocol (contains a ``__dataframe__`` attribute). This is experimental.
When data is specified as a DataFrame, the encoding is quite simple, as Altair uses the data type information provided by pandas to automatically determine From 17f51f98556c5086e1526c20904e8cb058ce6e36 Mon Sep 17 00:00:00 2001 From: mattijn Date: Sat, 18 Feb 2023 11:25:20 +0100 Subject: [PATCH 8/9] fix typos in sentence --- doc/releases/changes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst index 2ba04d4c2..8e3f25df5 100644 --- a/doc/releases/changes.rst +++ b/doc/releases/changes.rst @@ -22,7 +22,7 @@ Enhancements - The documentation page has been revamped, both in terms of appearance and content. - More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846) - Improved error messages (#2842) -- Include experimental support for the DataFrame Interchange Protocol (through `__dataframe__`. This is dependent on `pyarrow>=11.0.0` (#2888) +- Include experimental support for the DataFrame Interchange Protocol (through the ``__dataframe__`` attribute). This requires ``pyarrow>=11.0.0`` (#2888). Grammar Changes ~~~~~~~~~~~~~~~ From e0cda9e29ff96b3d3739da60fc0c8383706c691e Mon Sep 17 00:00:00 2001 From: mattijn Date: Sat, 18 Feb 2023 20:10:25 +0100 Subject: [PATCH 9/9] improve error messages --- altair/utils/data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index 8a46fb357..9b680d060 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -284,10 +284,12 @@ def import_pyarrow_interchange(): return pi except pkg_resources.DistributionNotFound: # The package is not installed - raise ImportError("The package 'pyarrow' is required, but not installed") + raise ImportError( + "Usage of the DataFrame Interchange Protocol requires the package 'pyarrow', but it is not installed."
+ ) except pkg_resources.VersionConflict: # The package is installed but does not meet the minimum version requirement raise ImportError( - "The installed version of 'pyarrow' does not meet " - "the minimum requirement of version 11.0.0." + "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. " + "Please update 'pyarrow' to use the DataFrame Interchange Protocol." )