Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add polars compatibility #6531

Merged
merged 42 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
83d1fbb
Add Polars support for data formatting and conversion
Dec 24, 2023
fec2bb3
Update Polars availability check in config.py
Dec 24, 2023
622e54e
Merge branch 'add-polars-compatibility' of github.com:psmyth94/datase…
Dec 24, 2023
82b9b7c
Merge branch 'add-polars-compatibility' of github.com:psmyth94/datase…
Dec 24, 2023
aae3f5a
Merge branch 'add-polars-compatibility' of github.com:psmyth94/datase…
Dec 24, 2023
7374e99
added to_polars
Mar 6, 2024
408b9d6
changed the logic of importing polars if not already called
Mar 6, 2024
39a5c56
Remove to and from_polars from table.py in order to maintain pa.table…
Mar 6, 2024
3aa7081
Merge branch 'main' into add-polars-compatibility
psmyth94 Mar 6, 2024
2f10384
fix unused import
Mar 6, 2024
a623f51
Merge branch 'add-polars-compatibility' of github.com:psmyth94/datase…
Mar 6, 2024
12fef57
fixed code formatting with ruff
Mar 6, 2024
a57fcbe
fix formatting issues with ruff
Mar 7, 2024
912c437
fix formatting issues using ruff
Mar 7, 2024
ce7c3c5
add tests for polars formatting
Mar 7, 2024
1b28d85
removed using InMemoryTable classmethod to convert polars to Table
Mar 7, 2024
eb4d7ce
added test for polars conversion
Mar 7, 2024
417f9ad
added missing ruff fixes
Mar 7, 2024
19e5d80
add polars in test dependencies
Mar 7, 2024
7c835a4
Fixed not executing default write method due to nested polars check.
Mar 7, 2024
d0582f9
Merge branch 'main' into add-polars-compatibility
psmyth94 Mar 7, 2024
fa51fd2
Update src/datasets/arrow_dataset.py
psmyth94 Mar 7, 2024
d09839c
Update src/datasets/arrow_dataset.py
psmyth94 Mar 7, 2024
1c6d2a5
Fix Polars DataFrame conversion bug
Mar 7, 2024
40614ee
Merge branch 'add-polars-compatibility' of github.com:psmyth94/datase…
Mar 7, 2024
7d6224b
Fix DataFrame conversion in arrow_dataset.py
Mar 7, 2024
a301eb3
Fix variable name in arrow_dataset.py
Mar 7, 2024
d062b57
Fix write_table to write_row in Dataset class
Mar 7, 2024
1dbdc80
fix formatting with ruff
Mar 7, 2024
1b9e450
Update polars dependency to include timezone support
Mar 7, 2024
23329d5
Remove polars in EXTRAS_REQUIRE
Mar 7, 2024
f4361cc
Replace deprecated method
Mar 7, 2024
53f471a
perform cleanup after use
Mar 7, 2024
52fd448
Merge branch 'main' into add-polars-compatibility
psmyth94 Mar 7, 2024
b898b57
remove unused import
Mar 7, 2024
4bacf3b
Add garbage collection to test_to_polars method
Mar 7, 2024
358b2cb
Remove unused import and unnecessary code in test_to_polars method
Mar 7, 2024
c00efad
Add additional args for to_polars method
Mar 7, 2024
ddaab5b
Fixed unclosed links to dataset file
Mar 7, 2024
a87998c
ruff cleanup
Mar 7, 2024
d1acc92
even ruffier cleanup
Mar 7, 2024
7ee3fdf
changed hash to reflect new SHA for ref/convert/parquet
HuggingFaceDocBuilder Mar 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
"transformers",
"typing-extensions>=4.6.1", # due to conflict between apache-beam and pydantic
"zstandard",
"polars[timezone]>=0.20.0",
]


Expand Down
109 changes: 109 additions & 0 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@
if TYPE_CHECKING:
import sqlite3

import polars as pl
import pyspark
import sqlalchemy

Expand Down Expand Up @@ -868,6 +869,48 @@ def from_pandas(
table = table.cast(features.arrow_schema)
return cls(table, info=info, split=split)

@classmethod
def from_polars(
    cls,
    df: "pl.DataFrame",
    features: Optional[Features] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
) -> "Dataset":
    """
    Create a `Dataset` from a `polars.DataFrame`.

    The DataFrame's underlying Arrow arrays are collected into an Arrow Table;
    this is mostly zero copy (CategoricalType columns are the data types that
    do copy).

    Args:
        df (`polars.DataFrame`): DataFrame to convert to Arrow Table
        features (`Features`, optional): Dataset features.
        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (`NamedSplit`, optional): Name of the dataset split.

    Examples:
    ```py
    >>> ds = Dataset.from_polars(df)
    ```
    """
    # `features` and `info.features`, when both given, must agree.
    if features is not None and info is not None and info.features != features:
        raise ValueError(
            f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
        )
    # Fall back to the features carried by `info` when `features` is not given.
    if features is None and info is not None:
        features = info.features
    if info is None:
        info = DatasetInfo()
    info.features = features
    arrow_table = InMemoryTable(df.to_arrow())
    if features is not None:
        # more expensive cast than building the table with schema=features.arrow_schema,
        # but needed to support e.g. the str -> Audio conversion
        arrow_table = arrow_table.cast(features.arrow_schema)
    return cls(arrow_table, info=info, split=split)

@classmethod
def from_dict(
cls,
Expand Down Expand Up @@ -3319,6 +3362,10 @@ def validate_function_output(processed_inputs, indices):
)
elif isinstance(indices, list) and isinstance(processed_inputs, Mapping):
allowed_batch_return_types = (list, np.ndarray, pd.Series)
if config.POLARS_AVAILABLE and "polars" in sys.modules:
import polars as pl

allowed_batch_return_types += (pl.Series, pl.DataFrame)
if config.TF_AVAILABLE and "tensorflow" in sys.modules:
import tensorflow as tf

Expand Down Expand Up @@ -3438,6 +3485,10 @@ def init_buffer_and_writer():
# If `update_data` is True after processing the first example/batch, initialize these resources with `init_buffer_and_writer`
buf_writer, writer, tmp_file = None, None, None

# Check if Polars is available and import it if so
if config.POLARS_AVAILABLE and "polars" in sys.modules:
import polars as pl

# Optionally initialize the writer as a context manager
with contextlib.ExitStack() as stack:
try:
Expand All @@ -3464,6 +3515,12 @@ def init_buffer_and_writer():
writer.write_row(example)
elif isinstance(example, pd.DataFrame):
writer.write_row(pa.Table.from_pandas(example))
elif (
config.POLARS_AVAILABLE
and "polars" in sys.modules
and isinstance(example, pl.DataFrame)
):
writer.write_row(example.to_arrow())
else:
writer.write(example)
num_examples_progress_update += 1
Expand Down Expand Up @@ -3497,6 +3554,10 @@ def init_buffer_and_writer():
writer.write_table(batch)
elif isinstance(batch, pd.DataFrame):
writer.write_table(pa.Table.from_pandas(batch))
elif (
config.POLARS_AVAILABLE and "polars" in sys.modules and isinstance(batch, pl.DataFrame)
):
writer.write_table(batch.to_arrow())
else:
writer.write_batch(batch)
num_examples_progress_update += num_examples_in_batch
Expand Down Expand Up @@ -4949,6 +5010,54 @@ def to_pandas(
for offset in range(0, len(self), batch_size)
)

def to_polars(
    self, batch_size: Optional[int] = None, batched: bool = False
) -> Union["pl.DataFrame", Iterator["pl.DataFrame"]]:
    """Returns the dataset as a `polars.DataFrame`. Can also return a generator for large datasets.

    Args:
        batch_size (`int`, *optional*):
            The size (number of rows) of the batches if `batched` is `True`.
            Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
        batched (`bool`):
            Set to `True` to return a generator that yields the dataset as batches
            of `batch_size` rows. Defaults to `False` (returns the whole dataset at once).

    Returns:
        `polars.DataFrame` or `Iterator[polars.DataFrame]`

    Raises:
        ValueError: If `polars` is not installed.

    Example:

    ```py
    >>> ds.to_polars()
    ```
    """
    # Guard clause: fail fast when the optional polars dependency is missing.
    if not config.POLARS_AVAILABLE:
        raise ValueError("Polars needs to be installed to be able to return Polars dataframes.")
    import polars as pl

    if not batched:
        # Whole dataset in one shot, honoring any indices mapping.
        return pl.from_arrow(
            query_table(
                table=self._data,
                key=slice(0, len(self)),
                indices=self._indices,
            )
        )
    batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE
    # Lazily yield one DataFrame per slice of `batch_size` rows.
    return (
        pl.from_arrow(
            query_table(
                table=self._data,
                key=slice(offset, offset + batch_size),
                indices=self._indices,
            )
        )
        for offset in range(0, len(self), batch_size)
    )

def to_parquet(
self,
path_or_buf: Union[PathLike, BinaryIO],
Expand Down
10 changes: 10 additions & 0 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@
else:
logger.info("Disabling PyTorch because USE_TF is set")

# Optional polars dependency: detect it without importing it (find_spec only
# inspects the import machinery, so startup stays cheap when polars is absent).
POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        # Resolve the installed distribution's version for logging.
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.info(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        # NOTE(review): the module spec exists but no distribution metadata was
        # found (e.g. an unusual install); POLARS_VERSION stays "N/A" while
        # POLARS_AVAILABLE remains True — confirm this asymmetry is intended.
        pass

TF_VERSION = "N/A"
TF_AVAILABLE = False

Expand Down
8 changes: 8 additions & 0 deletions src/datasets/formatting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ def _register_unavailable_formatter(
_register_formatter(PandasFormatter, "pandas", aliases=["pd"])
_register_formatter(CustomFormatter, "custom")

# Register the polars formatter only when the optional dependency is installed;
# otherwise register a placeholder that raises a helpful error on first use.
if config.POLARS_AVAILABLE:
    from .polars_formatter import PolarsFormatter

    _register_formatter(PolarsFormatter, "polars", aliases=["pl"])
else:
    _polars_error = ValueError("Polars needs to be installed to be able to return Polars dataframes.")
    _register_unavailable_formatter(_polars_error, "polars", aliases=["pl"])

if config.TORCH_AVAILABLE:
from .torch_formatter import TorchFormatter

Expand Down
122 changes: 122 additions & 0 deletions src/datasets/formatting/polars_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright 2020 The HuggingFace Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from collections.abc import Mapping
from functools import partial
from typing import TYPE_CHECKING, Optional

import pyarrow as pa

from .. import config
from ..features import Features
from ..features.features import decode_nested_example
from ..utils.py_utils import no_op_if_value_is_null
from .formatting import BaseArrowExtractor, TensorFormatter


if TYPE_CHECKING:
import polars as pl


class PolarsArrowExtractor(BaseArrowExtractor["pl.DataFrame", "pl.Series", "pl.DataFrame"]):
    """Extract rows, columns, and batches from a `pyarrow.Table` as polars objects."""

    @staticmethod
    def _polars():
        """Return the polars module, raising if polars is not installed.

        A plain `import` already consults `sys.modules` first, so there is no
        need for a manual `sys.modules` lookup to avoid re-importing.
        """
        if not config.POLARS_AVAILABLE:
            raise ValueError("Polars needs to be installed to be able to return Polars dataframes.")
        import polars

        return polars

    def extract_row(self, pa_table: pa.Table) -> "pl.DataFrame":
        # First row only: slice(length=1) avoids converting the whole table.
        return self._polars().from_arrow(pa_table.slice(length=1))

    def extract_column(self, pa_table: pa.Table) -> "pl.Series":
        polars = self._polars()
        # Select the first column and return it as a Series.
        return polars.from_arrow(pa_table.select([0]))[pa_table.column_names[0]]

    def extract_batch(self, pa_table: pa.Table) -> "pl.DataFrame":
        return self._polars().from_arrow(pa_table)


class PolarsFeaturesDecoder:
    """Decode feature columns (e.g. Audio, Image) of polars rows/columns/batches."""

    def __init__(self, features: Optional[Features]):
        # Optional Features describing which columns need decoding.
        self.features = features
        import polars as pl  # noqa: F401 - import pl at initialization

    def decode_row(self, row: "pl.DataFrame") -> "pl.DataFrame":
        # Map each column that requires decoding to a null-safe decode callable.
        decode = (
            {
                column_name: no_op_if_value_is_null(partial(decode_nested_example, feature))
                for column_name, feature in self.features.items()
                if self.features._column_requires_decoding[column_name]
            }
            if self.features
            else {}
        )
        if decode:
            # NOTE(review): passing a dict to `map_rows` and assigning columns via
            # `row[...] = ...` relies on specific polars behavior — confirm this
            # path is exercised; it only runs when a column requires decoding.
            row[list(decode.keys())] = row.map_rows(decode)
        return row

    def decode_column(self, column: "pl.Series", column_name: str) -> "pl.Series":
        # Build a decode callable only when this column's feature requires decoding.
        decode = (
            no_op_if_value_is_null(partial(decode_nested_example, self.features[column_name]))
            if self.features and column_name in self.features and self.features._column_requires_decoding[column_name]
            else None
        )
        if decode:
            # Element-wise decode; nulls are passed through by no_op_if_value_is_null.
            column = column.map_elements(decode)
        return column

    def decode_batch(self, batch: "pl.DataFrame") -> "pl.DataFrame":
        # A batch is decoded exactly like a (multi-row) DataFrame row.
        return self.decode_row(batch)


class PolarsFormatter(TensorFormatter[Mapping, "pl.DataFrame", Mapping]):
    """Formatter that renders pyarrow tables as polars DataFrames/Series, decoding features."""

    def __init__(self, features=None, **np_array_kwargs):
        super().__init__(features=features)
        self.np_array_kwargs = np_array_kwargs
        # Extractor is stored as a class and instantiated per call; decoder is shared.
        self.polars_arrow_extractor = PolarsArrowExtractor
        self.polars_features_decoder = PolarsFeaturesDecoder(features)
        import polars as pl  # noqa: F401 - import pl at initialization

    def format_row(self, pa_table: pa.Table) -> "pl.DataFrame":
        """Extract the first row as a one-row DataFrame and decode its features."""
        raw_row = self.polars_arrow_extractor().extract_row(pa_table)
        return self.polars_features_decoder.decode_row(raw_row)

    def format_column(self, pa_table: pa.Table) -> "pl.Series":
        """Extract the first column as a Series and decode it."""
        raw_column = self.polars_arrow_extractor().extract_column(pa_table)
        return self.polars_features_decoder.decode_column(raw_column, pa_table.column_names[0])

    def format_batch(self, pa_table: pa.Table) -> "pl.DataFrame":
        """Extract the whole table as a DataFrame and decode its features."""
        raw_batch = self.polars_arrow_extractor().extract_batch(pa_table)
        return self.polars_features_decoder.decode_batch(raw_batch)
Loading
Loading