From f8d8552fdda02e17af535c71b0240896651d16c8 Mon Sep 17 00:00:00 2001
From: Christopher Davis <christopherjdavis@gmail.com>
Date: Tue, 14 Feb 2023 16:06:00 -0800
Subject: [PATCH 1/9] Allow Polars DataFrames

---
 altair/vegalite/v5/api.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
index d457c006f..36b5d6c16 100644
--- a/altair/vegalite/v5/api.py
+++ b/altair/vegalite/v5/api.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from toolz.curried import pipe as _pipe
 import itertools
+from importlib.util import find_spec
 
 from .schema import core, channels, mixins, Undefined, SCHEMA_URL
 
@@ -108,6 +109,11 @@ def _prepare_data(data, context=None):
     if context is not None and data_transformers.consolidate_datasets:
         data = _consolidate_data(data, context)
 
+    if find_spec('polars'):
+        import polars as pl
+        if isinstance(data, pl.DataFrame):
+            data = core.Data({"values": data.write_json(row_oriented=True)})
+
     # if data is still not a recognized type, then return
     if not isinstance(data, (dict, core.Data)):
         warnings.warn("data of type {} not recognized".format(type(data)))

From 492c5926baef365db33b8f6a2aafa07f262254ab Mon Sep 17 00:00:00 2001
From: Christopher Davis <christopherjdavis@gmail.com>
Date: Tue, 14 Feb 2023 16:20:29 -0800
Subject: [PATCH 2/9] black formatting

---
 altair/vegalite/v5/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
index 36b5d6c16..989a0f222 100644
--- a/altair/vegalite/v5/api.py
+++ b/altair/vegalite/v5/api.py
@@ -109,8 +109,9 @@ def _prepare_data(data, context=None):
     if context is not None and data_transformers.consolidate_datasets:
         data = _consolidate_data(data, context)
 
-    if find_spec('polars'):
+    if find_spec("polars"):
         import polars as pl
+
         if isinstance(data, pl.DataFrame):
             data = core.Data({"values": data.write_json(row_oriented=True)})
 

From 1fbb7c104fe31b9d5e8cc1cf57a4178695ceefe5 Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Thu, 16 Feb 2023 00:13:05 +0100
Subject: [PATCH 3/9] code review suggestions

---
 altair/utils/data.py      | 12 ++++++++++--
 altair/vegalite/v5/api.py | 15 ++++++---------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index 403b69e82..9669a8f2b 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -76,6 +76,9 @@ def limit_rows(data, max_rows=5000):
             values = data["values"]
         else:
             return data
+    elif hasattr(data, "__dataframe__"):
+        if "polars" in type(data).__module__:
+            values = data
     if max_rows is not None and len(values) > max_rows:
         raise MaxRowsError(
             "The number of rows in your dataset is greater "
@@ -152,12 +155,17 @@ def to_values(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present.")
         return data
+    elif hasattr(data, "__dataframe__"):
+        # only support for polars dataframe
+        if "polars" in type(data).__module__:
+            # currently no sanization on the data
+            return {"values": data.to_dicts()}
 
 
 def check_data_type(data):
     """Raise if the data is not a dict or DataFrame."""
-    if not isinstance(data, (dict, pd.DataFrame)) and not hasattr(
-        data, "__geo_interface__"
+    if not isinstance(data, (dict, pd.DataFrame)) and not any(
+        hasattr(data, attr) for attr in ["__geo_interface__", "__dataframe__"]
     ):
         raise TypeError(
             "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format(
diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
index 989a0f222..2f36c267d 100644
--- a/altair/vegalite/v5/api.py
+++ b/altair/vegalite/v5/api.py
@@ -8,7 +8,6 @@
 import pandas as pd
 from toolz.curried import pipe as _pipe
 import itertools
-from importlib.util import find_spec
 
 from .schema import core, channels, mixins, Undefined, SCHEMA_URL
 
@@ -98,23 +97,21 @@ def _prepare_data(data, context=None):
         return data
 
     # convert dataframes  or objects with __geo_interface__ to dict
-    if isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
+    elif isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
         data = _pipe(data, data_transformers.get())
 
     # convert string input to a URLData
-    if isinstance(data, str):
+    elif isinstance(data, str):
         data = core.UrlData(data)
 
+    elif hasattr(data, "__dataframe__"):
+        if "polars" in type(data).__module__:
+            data = _pipe(data, data_transformers.get())
+
     # consolidate inline data to top-level datasets
     if context is not None and data_transformers.consolidate_datasets:
         data = _consolidate_data(data, context)
 
-    if find_spec("polars"):
-        import polars as pl
-
-        if isinstance(data, pl.DataFrame):
-            data = core.Data({"values": data.write_json(row_oriented=True)})
-
     # if data is still not a recognized type, then return
     if not isinstance(data, (dict, core.Data)):
         warnings.warn("data of type {} not recognized".format(type(data)))

From 45bbbb7398e68e6c696d3af6cbfcb16addb6c803 Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Thu, 16 Feb 2023 08:23:03 +0100
Subject: [PATCH 4/9] improve code comment

---
 altair/utils/data.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index 9669a8f2b..665738b10 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -156,9 +156,12 @@ def to_values(data):
             raise KeyError("values expected in data dict, but not present.")
         return data
     elif hasattr(data, "__dataframe__"):
-        # only support for polars dataframe
+        # here we like to provide agnostic dataframe support
+        # we start with experimental support for polars dataframe
         if "polars" in type(data).__module__:
-            # currently no sanization on the data
+            # currently the polars function .to_dicts() is used to retrieve the data
+            # but here we would like to explore options using the .__dataframe__()
+            # currently no sanitization on the data
             return {"values": data.to_dicts()}
 
 

From e02b646f7e9eebf9fbd312221073426b0c136c09 Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Thu, 16 Feb 2023 21:32:05 +0100
Subject: [PATCH 5/9] support dataframe interchange format

---
 altair/utils/data.py      | 50 ++++++++++++++++++++++++++++++++-------
 altair/vegalite/v5/api.py |  3 +--
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index 665738b10..f915f458b 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -77,8 +77,7 @@ def limit_rows(data, max_rows=5000):
         else:
             return data
     elif hasattr(data, "__dataframe__"):
-        if "polars" in type(data).__module__:
-            values = data
+        values = data
     if max_rows is not None and len(values) > max_rows:
         raise MaxRowsError(
             "The number of rows in your dataset is greater "
@@ -101,6 +100,13 @@ def sample(data, n=None, frac=None):
             n = n if n else int(frac * len(values))
             values = random.sample(values, n)
             return {"values": values}
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        n = n if n else int(frac * len(pa_table))
+        indices = random.sample(range(len(pa_table)), n)
+        return pa_table.take(indices)
 
 
 @curried.curry
@@ -156,13 +162,10 @@ def to_values(data):
             raise KeyError("values expected in data dict, but not present.")
         return data
     elif hasattr(data, "__dataframe__"):
-        # here we like to provide agnostic dataframe support
-        # we start with experimental support for polars dataframe
-        if "polars" in type(data).__module__:
-            # currently the polars function .to_dicts() is used to retrieve the data
-            # but here we would like to explore options using the .__dataframe__()
-            # currently no sanitization on the data
-            return {"values": data.to_dicts()}
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        return {"values": pa_table.to_pylist()}
 
 
 def check_data_type(data):
@@ -201,6 +204,11 @@ def _data_to_json_string(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present.")
         return json.dumps(data["values"], sort_keys=True)
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        return json.dumps(pa_table.to_pylist())
     else:
         raise NotImplementedError(
             "to_json only works with data expressed as " "a DataFrame or as a dict"
@@ -222,6 +230,15 @@ def _data_to_csv_string(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present")
         return pd.DataFrame.from_dict(data["values"]).to_csv(index=False)
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        import pyarrow as pa
+        import pyarrow.csv as pa_csv
+        pa_table = pi.from_dataframe(data)
+        csv_buffer = pa.BufferOutputStream()
+        pa_csv.write_csv(pa_table, csv_buffer)
+        return csv_buffer.getvalue().to_pybytes().decode()
     else:
         raise NotImplementedError(
             "to_csv only works with data expressed as " "a DataFrame or as a dict"
@@ -253,3 +270,18 @@ def curry(*args, **kwargs):
         AltairDeprecationWarning,
     )
     return curried.curry(*args, **kwargs)
+
+def import_pyarrow_interchange():
+    import pkg_resources
+    try:
+        pkg_resources.require("pyarrow>=11.0.0")
+        # The package is installed and meets the minimum version requirement
+        import pyarrow.interchange as pi
+        return pi
+    except pkg_resources.DistributionNotFound:
+        # The package is not installed
+        raise ImportError("The package 'pyarrow' is required, but not installed")
+    except pkg_resources.VersionConflict:
+        # The package is installed but does not meet the minimum version requirement
+        raise ImportError("The installed version of 'pyarrow' does not meet "
+                          "the minimum requirement of version 11.0.0.")
\ No newline at end of file
diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
index 2f36c267d..478c3e5d0 100644
--- a/altair/vegalite/v5/api.py
+++ b/altair/vegalite/v5/api.py
@@ -105,8 +105,7 @@ def _prepare_data(data, context=None):
         data = core.UrlData(data)
 
     elif hasattr(data, "__dataframe__"):
-        if "polars" in type(data).__module__:
-            data = _pipe(data, data_transformers.get())
+        data = _pipe(data, data_transformers.get())
 
     # consolidate inline data to top-level datasets
     if context is not None and data_transformers.consolidate_datasets:

From 8db81cd465744e2d6c19bc757a0e3f3c53c4698e Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Thu, 16 Feb 2023 21:39:10 +0100
Subject: [PATCH 6/9] linting

---
 altair/utils/data.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index f915f458b..8a46fb357 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -235,6 +235,7 @@ def _data_to_csv_string(data):
         pi = import_pyarrow_interchange()
         import pyarrow as pa
         import pyarrow.csv as pa_csv
+
         pa_table = pi.from_dataframe(data)
         csv_buffer = pa.BufferOutputStream()
         pa_csv.write_csv(pa_table, csv_buffer)
@@ -271,17 +272,22 @@ def curry(*args, **kwargs):
     )
     return curried.curry(*args, **kwargs)
 
+
 def import_pyarrow_interchange():
     import pkg_resources
+
     try:
         pkg_resources.require("pyarrow>=11.0.0")
         # The package is installed and meets the minimum version requirement
         import pyarrow.interchange as pi
+
         return pi
     except pkg_resources.DistributionNotFound:
         # The package is not installed
         raise ImportError("The package 'pyarrow' is required, but not installed")
     except pkg_resources.VersionConflict:
         # The package is installed but does not meet the minimum version requirement
-        raise ImportError("The installed version of 'pyarrow' does not meet "
-                          "the minimum requirement of version 11.0.0.")
\ No newline at end of file
+        raise ImportError(
+            "The installed version of 'pyarrow' does not meet "
+            "the minimum requirement of version 11.0.0."
+        )

From 75e931f9685d7add6cd67424a9119c237e5c4b0e Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Sat, 18 Feb 2023 11:24:10 +0100
Subject: [PATCH 7/9] include info in release notes and data section

---
 doc/releases/changes.rst | 1 +
 doc/user_guide/data.rst  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst
index 02978562e..2ba04d4c2 100644
--- a/doc/releases/changes.rst
+++ b/doc/releases/changes.rst
@@ -22,6 +22,7 @@ Enhancements
 - The documentation page has been revamped, both in terms of appearance and content.
 - More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846)
 - Improved error messages (#2842)
+- Include experimental support for the DataFrame Interchange Protocol (through `__dataframe__`. This is dependent on `pyarrow>=11.0.0` (#2888)
 
 Grammar Changes
 ~~~~~~~~~~~~~~~
diff --git a/doc/user_guide/data.rst b/doc/user_guide/data.rst
index 8ac86cdc2..d89cd8d21 100644
--- a/doc/user_guide/data.rst
+++ b/doc/user_guide/data.rst
@@ -21,6 +21,7 @@ there are many different ways of specifying a dataset:
 - as a url string pointing to a ``json`` or ``csv`` formatted text file
 - as a `geopandas GeoDataFrame <http://geopandas.org/data_structures.html#geodataframe>`_, `Shapely Geometries <https://shapely.readthedocs.io/en/latest/manual.html#geometric-objects>`_, `GeoJSON Objects <https://github.com/jazzband/geojson#geojson-objects>`_ or other objects that support the ``__geo_interface__``
 - as a generated dataset such as numerical sequences or geographic reference elements
+- as a DataFrame that supports the DataFrame Interchange Protocol (contains a ``__dataframe__`` attribute). This is experimental.
 
 When data is specified as a DataFrame, the encoding is quite simple, as Altair
 uses the data type information provided by pandas to automatically determine

From 17f51f98556c5086e1526c20904e8cb058ce6e36 Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Sat, 18 Feb 2023 11:25:20 +0100
Subject: [PATCH 8/9] fix typos in sentence

---
 doc/releases/changes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst
index 2ba04d4c2..8e3f25df5 100644
--- a/doc/releases/changes.rst
+++ b/doc/releases/changes.rst
@@ -22,7 +22,7 @@ Enhancements
 - The documentation page has been revamped, both in terms of appearance and content.
 - More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846)
 - Improved error messages (#2842)
-- Include experimental support for the DataFrame Interchange Protocol (through `__dataframe__`. This is dependent on `pyarrow>=11.0.0` (#2888)
+- Include experimental support for the DataFrame Interchange Protocol (through the ``__dataframe__`` attribute). This requires ``pyarrow>=11.0.0`` (#2888).
 
 Grammar Changes
 ~~~~~~~~~~~~~~~

From e0cda9e29ff96b3d3739da60fc0c8383706c691e Mon Sep 17 00:00:00 2001
From: mattijn <mattijn@gmail.com>
Date: Sat, 18 Feb 2023 20:10:25 +0100
Subject: [PATCH 9/9] improve error messages

---
 altair/utils/data.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/altair/utils/data.py b/altair/utils/data.py
index 8a46fb357..9b680d060 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -284,10 +284,12 @@ def import_pyarrow_interchange():
         return pi
     except pkg_resources.DistributionNotFound:
         # The package is not installed
-        raise ImportError("The package 'pyarrow' is required, but not installed")
+        raise ImportError(
+            "Usage of the DataFrame Interchange Protocol requires the package 'pyarrow', but it is not installed."
+        )
     except pkg_resources.VersionConflict:
         # The package is installed but does not meet the minimum version requirement
         raise ImportError(
-            "The installed version of 'pyarrow' does not meet "
-            "the minimum requirement of version 11.0.0."
+            "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. "
+            "Please update 'pyarrow' to use the DataFrame Interchange Protocol."
         )