From 49f71a890af7079857aba848584fecd5b462b394 Mon Sep 17 00:00:00 2001 From: Jakob Keller <57402305+jakob-keller@users.noreply.github.com> Date: Sat, 18 Feb 2023 06:55:42 +0100 Subject: [PATCH 01/10] perf(python): Improve performance of `expr_to_lit_or_expr` for arguments of type `Expr` by ~80% (#6967) --- py-polars/polars/internals/expr/expr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py index a4d6065dc166c..25a7a16f09f52 100644 --- a/py-polars/polars/internals/expr/expr.py +++ b/py-polars/polars/internals/expr/expr.py @@ -110,7 +110,9 @@ def expr_to_lit_or_expr( Expr """ - if isinstance(expr, str) and not str_to_lit: + if isinstance(expr, Expr): + pass + elif isinstance(expr, str) and not str_to_lit: expr = pli.col(expr) elif ( isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta)) @@ -123,7 +125,7 @@ def expr_to_lit_or_expr( structify = False elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)): expr = expr.otherwise(None) # implicitly add the null branch. - elif not isinstance(expr, Expr): + else: raise TypeError( f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with" " pl.lit or pl.col" From 5098966e590ab6752bfc60027646fd43c192dc8e Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sat, 18 Feb 2023 18:34:02 +0400 Subject: [PATCH 02/10] fix(python): if given, respect dtype timeunit when instantiating `pl.lit` value (#6991) --- py-polars/polars/internals/lazy_functions.py | 5 ++- py-polars/tests/unit/test_exprs.py | 41 +++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py index 2e06f46343b64..948d275bff098 100644 --- a/py-polars/polars/internals/lazy_functions.py +++ b/py-polars/polars/internals/lazy_functions.py @@ -1127,8 +1127,9 @@ def lit( """ tu: TimeUnit + if isinstance(value, datetime): - tu = "us" + tu = "us" if dtype is None else getattr(dtype, "tu", "us") e = lit(_datetime_to_pl_timestamp(value, tu)).cast(Datetime(tu)) if value.tzinfo is not None: return e.dt.replace_time_zone(str(value.tzinfo)) @@ -1136,7 +1137,7 @@ def lit( return e elif isinstance(value, timedelta): - tu = "us" + tu = "us" if dtype is None else getattr(dtype, "tu", "us") return lit(_timedelta_to_pl_timedelta(value, tu)).cast(Duration(tu)) elif isinstance(value, time): diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index f86bb5cfe131d..30a21505935eb 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -2,7 +2,8 @@ import random import typing -from typing import cast +from datetime import datetime, timedelta +from typing import Any, cast import numpy as np import pytest @@ -15,6 +16,7 @@ INTEGER_DTYPES, NUMERIC_DTYPES, TEMPORAL_DTYPES, + PolarsDataType, ) from polars.testing import assert_frame_equal, assert_series_equal @@ -520,3 +522,40 @@ def test_map_dict() -> None: "country_code": ["FR", None, "ES", "DE"], "remapped": ["France", "Not specified", "2", "Germany"], } + + +def test_lit_dtypes() -> None: + def lit_series(value: Any, dtype: PolarsDataType) -> pl.Series: + return pl.select(pl.lit(value, dtype=dtype)).to_series() + + d = datetime(2049, 10, 5, 1, 2, 3, 987654) + d_ms = datetime(2049, 10, 5, 1, 2, 3, 987000) + + td = timedelta(days=942, hours=6, microseconds=123456) + td_ms = timedelta(days=942, seconds=21600, microseconds=123000) + + df = pl.DataFrame( + { + "dtm_ms": lit_series(d, pl.Datetime("ms")), + "dtm_us": lit_series(d, pl.Datetime("us")), + "dtm_ns": lit_series(d, pl.Datetime("ns")), + "dur_ms": lit_series(td, pl.Duration("ms")), + "dur_us": lit_series(td, pl.Duration("us")), + "dur_ns": lit_series(td, pl.Duration("ns")), + "f32": lit_series(0, pl.Float32), + "u16": lit_series(0, pl.UInt16), + "i16": lit_series(0, pl.Int16), + } + ) + assert df.dtypes == [ + pl.Datetime("ms"), + pl.Datetime("us"), + pl.Datetime("ns"), + pl.Duration("ms"), + pl.Duration("us"), + pl.Duration("ns"), + pl.Float32, + pl.UInt16, + pl.Int16, + ] + assert df.row(0) == (d_ms, d, d, td_ms, td, td, 0, 0, 0) From 5fba83918eff167819e8bd97060673d2882261a9 Mon Sep 17 00:00:00 2001 From: J van Zundert Date: Sat, 18 Feb 2023 14:34:34 +0000 Subject: [PATCH 03/10] fix(python): Remove check for path to be non-directory if use_pyarrow (#6994) --- py-polars/polars/internals/io.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/py-polars/polars/internals/io.py b/py-polars/polars/internals/io.py index 1e6c6f5e3ebb9..1fca312813488 100644 --- a/py-polars/polars/internals/io.py +++ b/py-polars/polars/internals/io.py @@ -101,6 +101,10 @@ def managed_file(file: Any) -> Iterator[Any]: ) encoding_str = encoding if encoding else "utf8" + # PyArrow allows directories, so we only check that something is not + # a dir if we are not using PyArrow + check_not_dir = not use_pyarrow + if isinstance(file, bytes): if has_non_utf8_non_utf8_lossy_encoding: return _check_empty( @@ -138,7 +142,7 @@ def managed_file(file: Any) -> Iterator[Any]: BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")), context=f"Path ({file!r})", ) - return managed_file(normalise_filepath(file)) + return managed_file(normalise_filepath(file, check_not_dir)) if isinstance(file, str): # make sure that this is before fsspec @@ -151,7 +155,7 @@ def managed_file(file: Any) -> Iterator[Any]: if not has_non_utf8_non_utf8_lossy_encoding: if infer_storage_options(file)["protocol"] == "file": - return managed_file(normalise_filepath(file)) + return managed_file(normalise_filepath(file, check_not_dir)) kwargs["encoding"] = encoding return fsspec.open(file, **kwargs) @@ -161,12 +165,14 @@ def managed_file(file: Any) -> Iterator[Any]: if not has_non_utf8_non_utf8_lossy_encoding: if all(infer_storage_options(f)["protocol"] == "file" for f in file): - return managed_file([normalise_filepath(f) for f in file]) + return managed_file( + [normalise_filepath(f, check_not_dir) for f in file] + ) kwargs["encoding"] = encoding return fsspec.open_files(file, **kwargs) if isinstance(file, str): - file = normalise_filepath(file) + file = normalise_filepath(file, check_not_dir) if has_non_utf8_non_utf8_lossy_encoding: with open(file, encoding=encoding_str) as f: return _check_empty( From 69314ea903f8458e8ac0147ef6012332151627b5 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 18 Feb 2023 15:35:56 +0100 Subject: [PATCH 04/10] feat(python): More ergonomic `col` args (#6996) --- py-polars/polars/internals/lazy_functions.py | 139 +++++++++++-------- py-polars/tests/unit/test_lazy.py | 29 ++++ 2 files changed, 110 insertions(+), 58 deletions(-) diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py index 948d275bff098..3e696c28d0b8f 100644 --- a/py-polars/polars/internals/lazy_functions.py +++ b/py-polars/polars/internals/lazy_functions.py @@ -8,8 +8,6 @@ from polars import internals as pli from polars.datatypes import ( DTYPE_TEMPORAL_UNITS, - DataType, - DataTypeClass, Date, Datetime, Duration, @@ -81,33 +79,25 @@ def col( - name: ( - str - | Sequence[str] - | Sequence[PolarsDataType] - | set[PolarsDataType] - | frozenset[PolarsDataType] - | pli.Series - | PolarsDataType - ), + name: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], + *more_names: str | PolarsDataType, ) -> pli.Expr: """ - Return an expression representing a column in a DataFrame. - - Can be used to select: - - - a single column by name - - all columns by using a wildcard `"*"` - - column by regular expression if the regex starts with `^` and ends with `$` - - all columns with the same dtype by using a Polars type + Return an expression representing column(s) in a dataframe. Parameters ---------- name - A string that holds the name of the column + The name or datatype of the column(s) to represent. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_names + Additional names or datatypes of columns to represent, specified as positional + arguments. Examples -------- + Pass a single column name to represent that column. + >>> df = pl.DataFrame( ... { ... "ham": [1, 2, 3], @@ -127,6 +117,9 @@ def col( │ 2 │ │ 1 │ └─────┘ + + Use the wildcard ``*`` to represent all columns. + >>> df.select(pl.col("*")) shape: (3, 4) ┌─────┬───────────┬─────┬─────┐ @@ -138,17 +131,6 @@ def col( │ 2 ┆ 22 ┆ 2 ┆ b │ │ 3 ┆ 33 ┆ 1 ┆ c │ └─────┴───────────┴─────┴─────┘ - >>> df.select(pl.col("^ham.*$")) - shape: (3, 2) - ┌─────┬───────────┐ - │ ham ┆ hamburger │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════════╡ - │ 1 ┆ 11 │ - │ 2 ┆ 22 │ - │ 3 ┆ 33 │ - └─────┴───────────┘ >>> df.select(pl.col("*").exclude("ham")) shape: (3, 3) ┌───────────┬─────┬─────┐ @@ -160,6 +142,23 @@ def col( │ 22 ┆ 2 ┆ b │ │ 33 ┆ 1 ┆ c │ └───────────┴─────┴─────┘ + + Regular expression input is supported. + + >>> df.select(pl.col("^ham.*$")) + shape: (3, 2) + ┌─────┬───────────┐ + │ ham ┆ hamburger │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 11 │ + │ 2 ┆ 22 │ + │ 3 ┆ 33 │ + └─────┴───────────┘ + + Multiple columns can be represented by passing a list of names. + >>> df.select(pl.col(["hamburger", "foo"])) shape: (3, 2) ┌───────────┬─────┐ @@ -171,7 +170,23 @@ def col( │ 22 ┆ 2 │ │ 33 ┆ 1 │ └───────────┴─────┘ - >>> # Select columns with a dtype + + Or use positional arguments to represent multiple columns in the same way. + + >>> df.select(pl.col("hamburger", "foo")) + shape: (3, 2) + ┌───────────┬─────┐ + │ hamburger ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═════╡ + │ 11 ┆ 3 │ + │ 22 ┆ 2 │ + │ 33 ┆ 1 │ + └───────────┴─────┘ + + Easily select all columns that match a certain data type by passing that datatype. + >>> df.select(pl.col(pl.Utf8)) shape: (3, 1) ┌─────┐ @@ -183,8 +198,7 @@ def col( │ b │ │ c │ └─────┘ - >>> # Select columns from a list of dtypes - >>> df.select(pl.col([pl.Int64, pl.Float64])) + >>> df.select(pl.col(pl.Int64, pl.Float64)) shape: (3, 3) ┌─────┬───────────┬─────┐ │ ham ┆ hamburger ┆ foo │ @@ -197,33 +211,42 @@ def col( └─────┴───────────┴─────┘ """ - if isinstance(name, pli.Series): - name = name.to_list() - - if isinstance(name, DataTypeClass): - name = [name] + if more_names: + if isinstance(name, str): + names_str = [name] + names_str.extend(more_names) # type: ignore[arg-type] + return pli.wrap_expr(pycols(names_str)) + elif is_polars_dtype(name): + dtypes = [name] + dtypes.extend(more_names) + return pli.wrap_expr(_dtype_cols(dtypes)) + raise TypeError( + f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}" + ) - if isinstance(name, DataType): + if isinstance(name, str): + return pli.wrap_expr(pycol(name)) + elif is_polars_dtype(name): return pli.wrap_expr(_dtype_cols([name])) - - elif not isinstance(name, str) and isinstance( - name, (list, tuple, set, frozenset, Sequence) - ): - if len(name) == 0: - return pli.wrap_expr(pycols(name)) + elif isinstance(name, Iterable): + names = list(name) + if not names: + return pli.wrap_expr(pycols(names)) + + item = names[0] + if isinstance(item, str): + return pli.wrap_expr(pycols(names)) + elif is_polars_dtype(item): + return pli.wrap_expr(_dtype_cols(names)) else: - names = list(name) - item = names[0] - if isinstance(item, str): - return pli.wrap_expr(pycols(names)) - elif is_polars_dtype(item): - return pli.wrap_expr(_dtype_cols(names)) - else: - raise ValueError( - "Expected list values to be all `str` or all `DataType`" - ) - - return pli.wrap_expr(pycol(name)) + raise TypeError( + "Invalid input for `col`. Expected iterable of type `str` or `DataType`," + f" got iterable of type {type(item)!r}" + ) + else: + raise TypeError( + f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}" + ) def element() -> pli.Expr: diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 92e6d5b9f6a63..f1b4204661395 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -1665,3 +1665,32 @@ def test_cumagg_types() -> None: assert cumprod_lf.schema["c"] == pl.Float64 collected_cumprod_lf = cumprod_lf.collect() assert collected_cumprod_lf.schema == cumprod_lf.schema + + +def test_col() -> None: + df = pl.DataFrame( + { + "ham": [1, 2, 3], + "hamburger": [11, 22, 33], + "foo": [3, 2, 1], + "bar": ["a", "b", "c"], + } + ) + + # Single column + assert df.select(pl.col("foo")).columns == ["foo"] + # Regex + assert df.select(pl.col("*")).columns == ["ham", "hamburger", "foo", "bar"] + assert df.select(pl.col("^ham.*$")).columns == ["ham", "hamburger"] + assert df.select(pl.col("*").exclude("ham")).columns == ["hamburger", "foo", "bar"] + # Multiple inputs + assert df.select(pl.col(["hamburger", "foo"])).columns == ["hamburger", "foo"] + assert df.select(pl.col("hamburger", "foo")).columns == ["hamburger", "foo"] + assert df.select(pl.col(pl.Series(["ham", "foo"]))).columns == ["ham", "foo"] + # Dtypes + assert df.select(pl.col(pl.Utf8)).columns == ["bar"] + assert df.select(pl.col(pl.Int64, pl.Float64)).columns == [ + "ham", + "hamburger", + "foo", + ] From 98f664dcd0f4242135b219a4066d72a3f2602e2f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 18 Feb 2023 16:05:37 +0100 Subject: [PATCH 05/10] fix(rust, python): fix is_duplicated for utf8 dtype (#6997) --- polars/polars-ops/src/series/ops/is_unique.rs | 3 ++- py-polars/tests/unit/test_series.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/polars/polars-ops/src/series/ops/is_unique.rs b/polars/polars-ops/src/series/ops/is_unique.rs index bb63e75c6ffaf..e177745fac44f 100644 --- a/polars/polars-ops/src/series/ops/is_unique.rs +++ b/polars/polars-ops/src/series/ops/is_unique.rs @@ -56,7 +56,8 @@ fn dispatcher(s: &Series, invert: bool) -> PolarsResult { #[cfg(feature = "dtype-binary")] Utf8 => { let s = s.cast(&Binary).unwrap(); - return is_unique(&s); + let ca = s.binary().unwrap(); + is_unique_ca(ca, invert) } Float32 => { let ca = s.bit_repr_small(); diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py index e8ef0d2c0a75a..c79185ad11499 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/test_series.py @@ -1688,6 +1688,20 @@ def test_is_unique() -> None: s = pl.Series("a", [1, 2, 2, 3]) assert_series_equal(s.is_unique(), pl.Series("a", [True, False, False, True])) + # utf8 + assert pl.Series(["a", "b", "c", "a"]).is_duplicated().to_list() == [ + True, + False, + False, + True, + ] + assert pl.Series(["a", "b", "c", "a"]).is_unique().to_list() == [ + False, + True, + True, + False, + ] + def test_is_duplicated() -> None: s = pl.Series("a", [1, 2, 2, 3]) From 8abf20076d90b49b9e3096f786d9b87cc2247b76 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sat, 18 Feb 2023 19:39:05 +0400 Subject: [PATCH 06/10] feat(python): add `upper_bound` and `lower_bound` methods to `Series` (#6990) --- .../source/reference/series/descriptive.rst | 2 + py-polars/polars/internals/series/series.py | 106 +++++++++++++----- py-polars/tests/unit/conftest.py | 4 +- .../tests/unit/operations/test_groupby.py | 16 ++- py-polars/tests/unit/test_lazy.py | 10 +- py-polars/tests/unit/test_series.py | 23 ++++ 6 files changed, 127 insertions(+), 34 deletions(-) diff --git a/py-polars/docs/source/reference/series/descriptive.rst b/py-polars/docs/source/reference/series/descriptive.rst index e5e4d89033ae8..cecb5c21daa2e 100644 --- a/py-polars/docs/source/reference/series/descriptive.rst +++ b/py-polars/docs/source/reference/series/descriptive.rst @@ -28,8 +28,10 @@ Descriptive Series.is_unique Series.is_utf8 Series.len + Series.lower_bound Series.n_chunks Series.n_unique Series.null_count Series.unique_counts + Series.upper_bound Series.value_counts diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py index 00259e896b21d..67e19caa61c48 100644 --- a/py-polars/polars/internals/series/series.py +++ b/py-polars/polars/internals/series/series.py @@ -1619,30 +1619,25 @@ def n_chunks(self) -> int: """ return self._s.n_chunks() - def cumsum(self, reverse: bool = False) -> Series: + def cummax(self, reverse: bool = False) -> Series: """ - Get an array with the cumulative sum computed at every element. + Get an array with the cumulative max computed at every element. Parameters ---------- reverse reverse the operation. - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - Examples -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cumsum() + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() shape: (3,) - Series: 'a' [i64] + Series: 's' [i64] [ - 1 3 - 6 + 5 + 5 ] """ @@ -1658,10 +1653,10 @@ def cummin(self, reverse: bool = False) -> Series: Examples -------- - >>> s = pl.Series("a", [1, 2, 3]) + >>> s = pl.Series("s", [1, 2, 3]) >>> s.cummin() shape: (3,) - Series: 'a' [i64] + Series: 's' [i64] [ 1 1 @@ -1670,32 +1665,37 @@ def cummin(self, reverse: bool = False) -> Series: """ - def cummax(self, reverse: bool = False) -> Series: + def cumprod(self, reverse: bool = False) -> Series: """ - Get an array with the cumulative max computed at every element. + Get an array with the cumulative product computed at every element. Parameters ---------- reverse reverse the operation. + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + Examples -------- - >>> s = pl.Series("a", [3, 5, 1]) - >>> s.cummax() + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() shape: (3,) Series: 'a' [i64] [ - 3 - 5 - 5 + 1 + 2 + 6 ] """ - def cumprod(self, reverse: bool = False) -> Series: + def cumsum(self, reverse: bool = False) -> Series: """ - Get an array with the cumulative product computed at every element. + Get an array with the cumulative sum computed at every element. Parameters ---------- @@ -1710,12 +1710,12 @@ def cumprod(self, reverse: bool = False) -> Series: Examples -------- >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cumprod() + >>> s.cumsum() shape: (3,) Series: 'a' [i64] [ 1 - 2 + 3 6 ] @@ -4781,6 +4781,62 @@ def clip_max(self, max_val: int | float) -> Series: """ + def lower_bound(self) -> Self: + """ + Return the lower bound of this Series' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: 's' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: 's' [f32] + [ + -inf + ] + + """ + + def upper_bound(self) -> Self: + """ + Return the upper bound of this Series' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: 's' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: 's' [f64] + [ + inf + ] + + """ + def map_dict( self, remapping: dict[Any, Any], diff --git a/py-polars/tests/unit/conftest.py b/py-polars/tests/unit/conftest.py index 373cdbb178671..acd814d5b02e3 100644 --- a/py-polars/tests/unit/conftest.py +++ b/py-polars/tests/unit/conftest.py @@ -53,11 +53,13 @@ def fruits_cars() -> pl.DataFrame: "fruits": ["banana", "banana", "apple", "apple", "banana"], "B": [5, 4, 3, 2, 1], "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - } + }, + schema_overrides={"A": pl.Int64, "B": pl.Int64}, ) ISO8601_FORMATS = [] + for T in ["T", " "]: for hms in ( [ diff --git a/py-polars/tests/unit/operations/test_groupby.py b/py-polars/tests/unit/operations/test_groupby.py index 5bae6300b46de..b8ef46b1f5357 100644 --- a/py-polars/tests/unit/operations/test_groupby.py +++ b/py-polars/tests/unit/operations/test_groupby.py @@ -558,7 +558,7 @@ def test_overflow_mean_partitioned_groupby_5194(dtype: pl.PolarsDataType) -> Non def test_groupby_dynamic_elementwise_following_mean_agg_6904() -> None: - assert ( + df = ( pl.DataFrame( { "a": [ @@ -571,7 +571,13 @@ def test_groupby_dynamic_elementwise_following_mean_agg_6904() -> None: .groupby_dynamic("a", every="10s", period="100s") .agg([pl.col("b").mean().sin().alias("c")]) .collect() - ).to_dict(False) == { - "a": [datetime(2021, 1, 1, 0, 0), datetime(2021, 1, 1, 0, 0, 10)], - "c": [0.9092974268256817, -0.7568024953079282], - } + ) + assert_frame_equal( + df, + pl.DataFrame( + { + "a": [datetime(2021, 1, 1, 0, 0), datetime(2021, 1, 1, 0, 0, 10)], + "c": [0.9092974268256817, -0.7568024953079282], + } + ), + ) diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index f1b4204661395..27a2f83e00d6d 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -1358,9 +1358,13 @@ def test_head_tail(fruits_cars: pl.DataFrame) -> None: def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None: res_expr = fruits_cars.select(pl.col("A").lower_bound()) - assert res_expr["A"][0] < -10_000_000 - res_expr = fruits_cars.select(pl.col("A").upper_bound()) - assert res_expr["A"][0] > 10_000_000 + assert res_expr.item() == -9223372036854775808 + + res_expr = fruits_cars.select(pl.col("B").upper_bound()) + assert res_expr.item() == 9223372036854775807 + + with pytest.raises(pl.ComputeError): + fruits_cars.select(pl.col("fruits").upper_bound()) def test_nested_min_max() -> None: diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py index c79185ad11499..76155de4c1770 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/test_series.py @@ -2420,3 +2420,26 @@ def test_map_dict() -> None: s.map_dict(remap, default=s.cast(pl.Utf8)), pl.Series("s", ["-1", "two", None, "four", "-5"]), ) + + +@pytest.mark.parametrize( + ("dtype", "lower", "upper"), + [ + (pl.Int8, -128, 127), + (pl.UInt8, 0, 255), + (pl.Int16, -32768, 32767), + (pl.UInt16, 0, 65535), + (pl.Int32, -2147483648, 2147483647), + (pl.UInt32, 0, 4294967295), + (pl.Int64, -9223372036854775808, 9223372036854775807), + (pl.UInt64, 0, 18446744073709551615), + (pl.Float32, float("-inf"), float("inf")), + (pl.Float64, float("-inf"), float("inf")), + ], +) +def test_upper_lower_bounds( + dtype: PolarsDataType, upper: int | float, lower: int | float +) -> None: + s = pl.Series("s", dtype=dtype) + assert s.lower_bound().item() == lower + assert s.upper_bound().item() == upper From 5e4a34599effeec6b9faa63e5758375655181611 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 18 Feb 2023 16:39:37 +0100 Subject: [PATCH 07/10] feat(python): More ergonomic `over` args (#6986) --- py-polars/polars/internals/dataframe/frame.py | 4 +- py-polars/polars/internals/expr/expr.py | 132 +++++++++++------- py-polars/polars/internals/lazyframe/frame.py | 4 +- .../tests/unit/operations/test_window.py | 24 ++++ 4 files changed, 111 insertions(+), 53 deletions(-) diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py index 8a971c9a622c7..56e0c21edbb87 100644 --- a/py-polars/polars/internals/dataframe/frame.py +++ b/py-polars/polars/internals/dataframe/frame.py @@ -3596,8 +3596,8 @@ def groupby( Parameters ---------- by - Column or columns to group by. Accepts expression input. Strings are parsed - as column names. + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. *more_by Additional columns to group by, specified as positional arguments. maintain_order diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py index 25a7a16f09f52..52ee0f3f0e19f 100644 --- a/py-polars/polars/internals/expr/expr.py +++ b/py-polars/polars/internals/expr/expr.py @@ -2768,68 +2768,102 @@ def last(self) -> Self: """ return self._from_pyexpr(self._pyexpr.last()) - def over(self, expr: str | Expr | list[Expr | str]) -> Self: + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: """ - Apply window function over a subgroup. + Compute expressions over the given groups. - This is similar to a groupby + aggregation + self join. - Or similar to `window functions in Postgres - `_. + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. Parameters ---------- expr - Column(s) to group by. + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. Examples -------- + Pass the name of a column to compute the expression over that column. + >>> df = pl.DataFrame( ... { - ... "groups": ["g1", "g1", "g2"], - ... "values": [1, 2, 3], - ... } - ... ) - >>> df.with_columns(pl.col("values").max().over("groups").alias("max_by_group")) - shape: (3, 3) - ┌────────┬────────┬──────────────┐ - │ groups ┆ values ┆ max_by_group │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞════════╪════════╪══════════════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g1 ┆ 2 ┆ 2 │ - │ g2 ┆ 3 ┆ 3 │ - └────────┴────────┴──────────────┘ - >>> df = pl.DataFrame( - ... { - ... "groups": [1, 1, 2, 2, 1, 2, 3, 3, 1], - ... "values": [1, 2, 3, 4, 5, 6, 7, 8, 8], + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.lazy().select( - ... pl.col("groups").sum().over("groups"), - ... ).collect() - shape: (9, 1) - ┌────────┐ - │ groups │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 4 │ - │ 4 │ - │ 6 │ - │ 6 │ - │ ... │ - │ 6 │ - │ 6 │ - │ 6 │ - │ 4 │ - └────────┘ - - """ - pyexprs = selection_to_pyexpr_list(expr) - - return self._from_pyexpr(self._pyexpr.over(pyexprs)) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + """ + exprs = selection_to_pyexpr_list(expr) + exprs.extend(selection_to_pyexpr_list(more_exprs)) + return self._from_pyexpr(self._pyexpr.over(exprs)) def is_unique(self) -> Self: """ diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py index ec25c78be22c2..1dc9b520f50e8 100644 --- a/py-polars/polars/internals/lazyframe/frame.py +++ b/py-polars/polars/internals/lazyframe/frame.py @@ -1694,8 +1694,8 @@ def groupby( Parameters ---------- by - Column or columns to group by. Accepts expression input. Strings are parsed - as column names. + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. *more_by Additional columns to group by, specified as positional arguments. maintain_order diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index fe2e0792d9215..c786f4d2cee24 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -7,6 +7,30 @@ from polars.testing import assert_frame_equal, assert_series_equal +def test_over_args() -> None: + df = pl.DataFrame( + { + "a": ["a", "a", "b"], + "b": [1, 2, 3], + "c": [3, 2, 1], + } + ) + + # Single input + expected = pl.Series("c", [3, 3, 1]).to_frame() + result = df.select(pl.col("c").max().over("a")) + assert_frame_equal(result, expected) + + # Multiple input as list + expected = pl.Series("c", [3, 2, 1]).to_frame() + result = df.select(pl.col("c").max().over(["a", "b"])) + assert_frame_equal(result, expected) + + # Multiple input as positional args + result = df.select(pl.col("c").max().over("a", "b")) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [pl.Float32, pl.Float64, pl.Int32]) def test_std(dtype: type[pl.DataType]) -> None: if dtype == pl.Int32: From d28842b849fa1114697875c2d2cbba58a4c4961d Mon Sep 17 00:00:00 2001 From: papparapa <37232476+papparapa@users.noreply.github.com> Date: Sun, 19 Feb 2023 00:40:17 +0900 Subject: [PATCH 08/10] fix(rust, python): dtype of pow function (#6985) --- .../src/dsl/function_expr/schema.rs | 3 +- py-polars/tests/unit/test_lazy.py | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs index bd8be2b62324a..81dafbc72289d 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs @@ -33,7 +33,6 @@ impl FunctionExpr { Ok(fld) }; - #[cfg(any(feature = "rolling_window", feature = "trigonometry", feature = "log"))] // set float supertype let float_dtype = || { map_dtype(&|dtype| match dtype { @@ -108,7 +107,7 @@ impl FunctionExpr { use FunctionExpr::*; match self { NullCount => with_dtype(IDX_DTYPE), - Pow => super_type(), + Pow => float_dtype(), Coalesce => super_type(), #[cfg(feature = "row_hash")] Hash(..) => with_dtype(DataType::UInt64), diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 27a2f83e00d6d..a6338780c95f3 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -1698,3 +1698,31 @@ def test_col() -> None: "hamburger", "foo", ] + + +def test_compare_schema_between_lazy_and_eager_6904() -> None: + float32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Float32)}) + eager_result = float32_df.select(pl.col("x").sqrt()).select(pl.col(pl.Float32)) + lazy_result = ( + float32_df.lazy() + .select(pl.col("x").sqrt()) + .select(pl.col(pl.Float32)) + .collect() + ) + assert eager_result.shape == lazy_result.shape + + eager_result = float32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float32)) + lazy_result = ( + float32_df.lazy() + .select(pl.col("x").pow(2)) + .select(pl.col(pl.Float32)) + .collect() + ) + assert eager_result.shape == lazy_result.shape + + int32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Int32)}) + eager_result = int32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float64)) + lazy_result = ( + int32_df.lazy().select(pl.col("x").pow(2)).select(pl.col(pl.Float64)).collect() + ) + assert eager_result.shape == lazy_result.shape From b4b1893c9ce048c377f281046157a4609ca7c037 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 18 Feb 2023 16:40:28 +0100 Subject: [PATCH 09/10] fix(rust, python): fix fill_null for categoricals (#6998) --- polars/polars-core/src/chunked_array/cast.rs | 23 +++++++++++++++++++- py-polars/tests/unit/test_df.py | 5 +++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/polars/polars-core/src/chunked_array/cast.rs b/polars/polars-core/src/chunked_array/cast.rs index 89c52a19e6136..345798a0c41b6 100644 --- a/polars/polars-core/src/chunked_array/cast.rs +++ b/polars/polars-core/src/chunked_array/cast.rs @@ -122,7 +122,28 @@ where } fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult { - self.cast_impl(data_type, false) + match data_type { + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(Some(rev_map)) => { + if self.dtype() == &DataType::UInt32 { + // safety: + // we are guarded by the type system. + let ca = unsafe { &*(self as *const ChunkedArray as *const UInt32Chunked) }; + Ok(unsafe { + CategoricalChunked::from_cats_and_rev_map_unchecked( + ca.clone(), + rev_map.clone(), + ) + } + .into_series()) + } else { + Err(PolarsError::ComputeError( + "Cannot cast numeric types to 'Categorical'".into(), + )) + } + } + _ => self.cast_impl(data_type, false), + } } } diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py index c496a53c25dab..5e6db830688d8 100644 --- a/py-polars/tests/unit/test_df.py +++ b/py-polars/tests/unit/test_df.py @@ -1862,6 +1862,11 @@ def test_fill_null() -> None: ], "b_backward": ["Apple", "Orange", "Carrot", "Carrot", None, None], } + # categoricals + df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical)) + s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"] + assert s.dtype == pl.Categorical + assert s.to_list() == ["a", "a"] def test_fill_nan() -> None: From b7178f185e17a34b518c5bff2a6b7c1a5d073c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= <91746947+ozgrakkurt@users.noreply.github.com> Date: Sat, 18 Feb 2023 18:45:14 +0300 Subject: [PATCH 10/10] chore(rust): remove time 0.1 dep (#6979) --- polars/polars-arrow/Cargo.toml | 2 +- polars/polars-core/Cargo.toml | 2 +- polars/polars-io/Cargo.toml | 2 +- polars/polars-time/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/polars/polars-arrow/Cargo.toml b/polars/polars-arrow/Cargo.toml index 9a50f35846733..ed549b99aede8 100644 --- a/polars/polars-arrow/Cargo.toml +++ b/polars/polars-arrow/Cargo.toml @@ -10,7 +10,7 @@ description = "Arrow interfaces for Polars DataFrame library" [dependencies] arrow.workspace = true -chrono = { version = "0.4", optional = true } +chrono = { version = "0.4", default-features = false, features = ["std"], optional = true } chrono-tz = { version = "0.8", optional = true } hashbrown.workspace = true num.workspace = true diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 9bae82c3b69b8..72b4d7e5def1a 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -155,7 +155,7 @@ ahash.workspace = true anyhow.workspace = true arrow.workspace = true bitflags.workspace = true -chrono = { version = "0.4", optional = true } +chrono = { version = "0.4", default-features = false, features = ["std"], optional = true } chrono-tz = { version = "0.8", optional = true } comfy-table = { version = "6.1.4", optional = true, default_features = false } hashbrown.workspace = true diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index a203e7a8911df..6212e14d21990 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -55,7 +55,7 @@ anyhow.workspace = true arrow.workspace = true async-trait = { version = "0.1.59", optional = true } bytes = "1.3.0" -chrono = { version = "0.4.23", optional = true } +chrono = { version = "0.4", default-features = false, features = ["std"], optional = true } chrono-tz = { version = "0.8.1", optional = true } dirs = "4.0" flate2 = { version = "1", optional = true, default-features = false } diff --git a/polars/polars-time/Cargo.toml b/polars/polars-time/Cargo.toml index 84d129434b0da..962d85a50fc80 100644 --- a/polars/polars-time/Cargo.toml +++ b/polars/polars-time/Cargo.toml @@ -9,7 +9,7 @@ description = "Time related code for the polars dataframe library" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -chrono = "0.4" +chrono = { version = "0.4", default-features = false, features = ["std"] } chrono-tz = { version = "0.8", optional = true } lexical = { version = "6", default-features = false, features = ["std", "parse-floats", "parse-integers"] } now = "0.1"